diff --git a/nemo/collections/multimodal/modules/speechllm_perception.py b/nemo/collections/multimodal/modules/speechllm_perception.py
index 3745c5971c73..a656646bac63 100644
--- a/nemo/collections/multimodal/modules/speechllm_perception.py
+++ b/nemo/collections/multimodal/modules/speechllm_perception.py
@@ -68,7 +68,10 @@ def __init__(self, cfg: DictConfig):
         # Initialize components
         self.preprocessor = self.from_config_dict(cfg.preprocessor)
         self.encoder = self.from_config_dict(cfg.encoder)
-        self.spec_augmentation = self.from_config_dict(cfg.spec_augment)
+        if 'spec_augment' in cfg and cfg.spec_augment is not None:
+            self.spec_augmentation = self.from_config_dict(cfg.spec_augment)
+        else:
+            self.spec_augmentation = None
         self.matcher = self.from_config_dict(cfg.matcher)
         self.proj = nn.Linear(cfg.matcher.d_model, cfg.output_dim)