Transformer.py
import torch
import torch.nn as nn
import math
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # Scale the embeddings by sqrt(d_model), as in "Attention Is All You Need".
        return self.embedding(x) * math.sqrt(self.d_model)
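
# Usage sketch (hypothetical sizes, not from the original file): token ids of
# shape (batch, seq_len) come out as scaled vectors of shape (batch, seq_len, d_model).
#   emb = InputEmbeddings(d_model=512, vocab_size=10000)
#   out = emb(torch.randint(0, 10000, (2, 6)))   # (2, 6, 512)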
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        '''
        Matrix of shape (seq_len, d_model), where seq_len is the sentence length
        and d_model is the size of the vector representing each word:
        many vectors of length d_model, one per word in the sentence.
        '''
        matriz = torch.zeros(seq_len, d_model)
        # Column vector with the position of each word in the sentence: (seq_len, 1).
        pos = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # Denominator of the positional encoding, 10000^(2i/d_model), computed in log space.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        '''
        In the matrix we compute, the even columns take the sine of the position
        multiplied by the denominator term, and the odd columns take the cosine
        of the position multiplied by the denominator term.
        '''
        matriz[:, 0::2] = torch.sin(pos * div_term)
        matriz[:, 1::2] = torch.cos(pos * div_term)
        matriz = matriz.unsqueeze(0)  # (1, seq_len, d_model), broadcast over the batch
        # Save the matrix in the buffer: stored with the module, but not trained.
        self.register_buffer('matriz', matriz)

    def forward(self, x):
        # Add the encodings for the first x.shape[1] positions; they are not learned.
        x = x + (self.matriz[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)
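
# Shape sketch (assumed sizes): the buffer is (1, seq_len, d_model) and is
# broadcast over the batch, so inputs shorter than seq_len work as well.
#   pe = PositionalEncoding(d_model=512, seq_len=100, dropout=0.1)
#   out = pe(torch.zeros(2, 6, 512))             # (2, 6, 512)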
# Layer normalization
class LayerNormalization(nn.Module):
    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # multiplicative
        self.bias = nn.Parameter(torch.zeros(1))  # additive

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias
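
# Sanity-check sketch: right after initialization (alpha=1, bias=0) each feature
# vector is normalized along the last dimension, so its mean is roughly zero.
#   ln = LayerNormalization()
#   y = ln(torch.randn(2, 6, 512))
#   y.mean(dim=-1)                               # ~0 everywhere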
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))
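
# Usage sketch (d_ff=2048 is the paper's default, assumed here): the block is
# position-wise, mapping (batch, seq_len, d_model) back to the same shape.
#   ffn = FeedForwardBlock(d_model=512, d_ff=2048, dropout=0.1)
#   out = ffn(torch.randn(2, 6, 512))            # (2, 6, 512)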
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, 'd_model is not divisible by h'
        self.d_k = d_model // h
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Scaled dot-product attention: scores have shape (Batch, h, seq_len, seq_len).
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.W_Q(q)  # (Batch, seq_len, d_model) ---> (Batch, seq_len, d_model)
        key = self.W_K(k)
        value = self.W_V(v)
        # (Batch, seq_len, d_model) ---> (Batch, seq_len, h, d_k) ---> (Batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        # (Batch, h, seq_len, d_k) ---> (Batch, seq_len, h, d_k) ---> (Batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        return self.w_o(x)
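
# Usage sketch (assumed sizes): self-attention passes the same tensor as q, k
# and v; a mask must broadcast against scores of shape (batch, h, seq_len, seq_len).
#   mha = MultiHeadAttentionBlock(d_model=512, h=8, dropout=0.1)
#   x = torch.randn(2, 6, 512)
#   out = mha(x, x, x, mask=None)                # (2, 6, 512)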
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))
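
# Note: this is the pre-norm variant (normalize, apply the sublayer, then add
# the skip connection), so the sublayer must map x to the same shape:
#   rc = ResidualConnection(dropout=0.1)
#   out = rc(x, lambda t: mha(t, t, t, None))    # x + dropout(sublayer(norm(x)))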
class EncoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x
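
# Assembly sketch (hypothetical hyperparameters): each encoder block owns its
# attention and feed-forward sublayers plus two residual connections.
#   block = EncoderBlock(MultiHeadAttentionBlock(512, 8, 0.1),
#                        FeedForwardBlock(512, 2048, 0.1), dropout=0.1)
#   out = block(torch.randn(2, 6, 512), src_mask=None)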
class Encoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
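
# Stacking sketch (N=6 follows the paper's default, assumed here): the encoder
# simply chains N blocks and applies one final normalization.
#   layers = nn.ModuleList([EncoderBlock(MultiHeadAttentionBlock(512, 8, 0.1),
#                                        FeedForwardBlock(512, 2048, 0.1), 0.1)
#                           for _ in range(6)])
#   encoder = Encoder(layers)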
class Decoderblock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    # Input of the decoder, output of the encoder, mask applied to the encoder, mask applied to the decoder.
    def forward(self, x, encoder_output, src_mask, target_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, target_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
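
# Flow sketch: self-attention uses the target (causal) mask, then cross-attention
# queries the decoder state against the encoder output under the source mask.
#   dec_block = Decoderblock(MultiHeadAttentionBlock(512, 8, 0.1),
#                            MultiHeadAttentionBlock(512, 8, 0.1),
#                            FeedForwardBlock(512, 2048, 0.1), dropout=0.1)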
class Decoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, target_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, target_mask)
        return self.norm(x)
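
# Like the encoder, the decoder threads x through every block, passing the same
# encoder output and masks to each, then normalizes once at the end:
#   out = decoder(tgt, encoder_output, src_mask, target_mask)  # (batch, tgt_len, d_model)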
# Convert each embedding into log-probabilities over the vocabulary.
class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # (batch, seq_len, d_model) ---> (batch, seq_len, vocab_size)
        return torch.log_softmax(self.proj(x), dim=-1)
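
# Minimal end-to-end smoke test (hypothetical hyperparameters; this file does
# not define a full Transformer wrapper, so the pieces are wired up by hand,
# and one embedding is shared between source and target purely for brevity).
if __name__ == "__main__":
    d_model, h, d_ff, dropout, vocab_size, N = 512, 8, 2048, 0.1, 1000, 2
    emb = InputEmbeddings(d_model, vocab_size)
    pe = PositionalEncoding(d_model, seq_len=50, dropout=dropout)
    encoder = Encoder(nn.ModuleList([
        EncoderBlock(MultiHeadAttentionBlock(d_model, h, dropout),
                     FeedForwardBlock(d_model, d_ff, dropout), dropout)
        for _ in range(N)]))
    decoder = Decoder(nn.ModuleList([
        Decoderblock(MultiHeadAttentionBlock(d_model, h, dropout),
                     MultiHeadAttentionBlock(d_model, h, dropout),
                     FeedForwardBlock(d_model, d_ff, dropout), dropout)
        for _ in range(N)]))
    proj = ProjectionLayer(d_model, vocab_size)

    src = torch.randint(0, vocab_size, (2, 10))
    tgt = torch.randint(0, vocab_size, (2, 7))
    # Causal mask so each target position only attends to earlier positions.
    tgt_mask = torch.tril(torch.ones(1, 7, 7)).int()
    enc_out = encoder(pe(emb(src)), mask=None)
    dec_out = decoder(pe(emb(tgt)), enc_out, None, tgt_mask)
    print(proj(dec_out).shape)  # torch.Size([2, 7, 1000])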