Merge pull request #741 from douglasjacobsen/py-nemo-cache-create

Add a NeMo executable that creates the transformers tokenizer cache before pretraining
GoogleCloudPlatform · Nov 6, 2024 · 88834df
2 parents f39fd59 + 376b7b8
commit 88834df
Showing 1 changed file with 41 additions and 31 deletions.
diff --git a/var/ramble/repos/builtin/applications/py-nemo/application.py b/var/ramble/repos/builtin/applications/py-nemo/application.py
@@ -27,10 +27,15 @@ class PyNemo(ExecutableApplication):
 
     tags("ml-framework", "machine-learning")
 
+    executable(
+        "setup_transformer_cache",
+        'bash -c "python3 -c \'from transformers import AutoTokenizer; AutoTokenizer.from_pretrained(\\"gpt2\\")\'"',
+        use_mpi=True,
+    )
+
     executable(
         "pretraining_exec",
-        'bash -c "cd /opt/NeMo; git rev-parse HEAD; export PYTHONPATH=/opt/NeMo:\${PYTHONPATH}; '
-        "CUDA_VISIBLE_DEVICES={cuda_visible_devices} "
+        'bash -c "cd /opt/NeMo; git rev-parse HEAD; '
         "python3 -u /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py "
         '--config-path={nemo_generated_config_path} --config-name={nemo_generated_config_name}"',
         use_mpi=True,
@@ -50,7 +55,11 @@ class PyNemo(ExecutableApplication):
 
     workload(
         "pretraining",
-        executables=["create_logs", "pretraining_exec"],
+        executables=[
+            "create_logs",
+            "setup_transformer_cache",
+            "pretraining_exec",
+        ],
         inputs=["nemo_fetched_config"],
     )
 
@@ -1361,38 +1370,39 @@ def _preprocess_log(self, workspace, app_inst):
 
         final_regex = re.compile(self.final_epoch_regex)
 
-        with open(log_file, "r", encoding="ISO-8859-1") as f:
-            data = f.read()
-
-        with open(log_file, "r", encoding="ISO-8859-1") as f:
-            for line in f.readlines():
-                m = final_regex.match(line)
+        if os.path.exists(log_file):
+            with open(log_file, "r", encoding="ISO-8859-1") as f:
+                data = f.read()
 
-                if m:
-                    timestamp = m.group("elapsed_time")
+            with open(log_file, "r", encoding="ISO-8859-1") as f:
+                for line in f.readlines():
+                    m = final_regex.match(line)
 
-                    time_parts = timestamp.split(":")
+                    if m:
+                        timestamp = m.group("elapsed_time")
 
-                    part_s = 0
-                    mult = 1
-                    for part in reversed(time_parts):
-                        part_s += int(part) * mult
-                        mult = mult * 60
-                    elapsed_s += part_s
+                        time_parts = timestamp.split(":")
 
-        processed_log = self.expander.expand_var(
-            "{experiment_run_dir}/processed_{experiment_name}.out"
-        )
+                        part_s = 0
+                        mult = 1
+                        for part in reversed(time_parts):
+                            part_s += int(part) * mult
+                            mult = mult * 60
+                        elapsed_s += part_s
 
-        with open(processed_log, "w+") as f:
-            f.write(
-                data.replace("\x13", "\n")
-                .replace("\x96\x88", "")
-                .replace("â", "")
+            processed_log = self.expander.expand_var(
+                "{experiment_run_dir}/processed_{experiment_name}.out"
             )
 
-        sec_file_path = self.expander.expand_var(
-            "{experiment_run_dir}/elapsed_seconds"
-        )
-        with open(sec_file_path, "w+") as f:
-            f.write(f"Elapsed seconds: {elapsed_s}")
+            with open(processed_log, "w+") as f:
+                f.write(
+                    data.replace("\x13", "\n")
+                    .replace("\x96\x88", "")
+                    .replace("â", "")
+                )
+
+            sec_file_path = self.expander.expand_var(
+                "{experiment_run_dir}/elapsed_seconds"
+            )
+            with open(sec_file_path, "w+") as f:
+                f.write(f"Elapsed seconds: {elapsed_s}")