diff --git a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py index 5d2c2376ebb5..ab33532c3c1f 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py @@ -298,7 +298,7 @@ def encode(self, x): class BERTTokenizer(AbstractEncoder): - """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)""" + """Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)""" def __init__(self, device="cuda", vq_interface=True, max_length=77): super().__init__() @@ -530,7 +530,10 @@ def __init__( print(f"Downloading clip with", arch, version, cache_dir) self.device = device model, _, _ = open_clip.create_model_and_transforms( - arch, device=torch.device("cpu"), pretrained=version, cache_dir=cache_dir, + arch, + device=torch.device("cpu"), + pretrained=version, + cache_dir=cache_dir, ) del model.visual self.model = model @@ -669,7 +672,11 @@ def build_tokenizer(self, cfg): legacy=legacy, ) - _, self.text_transform = get_preprocess_fns(cfg, self.tokenizer, is_train=False,) + _, self.text_transform = get_preprocess_fns( + cfg, + self.tokenizer, + is_train=False, + ) self.max_length = cfg.text.get("max_position_embeddings") def load_model(self, cfg, state_dict): @@ -764,7 +771,11 @@ def __init__( super().__init__() assert layer in self.LAYERS self.projection_dim = 1280 - model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device("cpu"), pretrained=version,) + model, _, _ = open_clip.create_model_and_transforms( + arch, + device=torch.device("cpu"), + pretrained=version, + ) del model.visual self.model = model