diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 5565a08..e2b831e 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,6 +1,13 @@
 Change Log
 ==========
 
+0.4.9 (22/01/2023)
+-----------------
+* Added assert to Attention class (from extensions) when mask is used
+* Fixed confusion matrix cpu/gpu device error
+* Better handling of callbacks where apply_on_states=None (the callback is applied on all states)
+* Updated Pipfile
+
 
 0.4.8 (15/09/2022)
 -----------------
diff --git a/Pipfile b/Pipfile
index 8293cfd..b94e60b 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,8 +4,8 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 
 [dev-packages]
-tensorboard = "==2.3.0"
-tqdm = "==4.51.0"
+tensorboard = "*"
+tqdm = "*"
 
 [packages]
 numpy = "*"
@@ -14,4 +14,4 @@ torchvision = "*"
 protobuf = "==3.20.*"
 
 [requires]
-python_version = "3.7.6"
+python_version = "3.9.1"
diff --git a/README.md b/README.md
index b878bb4..a882beb 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,11 @@ There are 2 types of ``lpd`` packagaes available
     pip install lpd-nodeps
 ```
 
-<b>[v0.4.8-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following:</b>
-* Added AbsoluteThresholdChecker & RelativeThresholdChecker classes
-* ThresholdCheckers can now be used in CallbackMonitor to better define metric tracking
+<b>[v0.4.9-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following:</b>
+* Added assert to Attention class (from extensions) when mask is used
+* Fixed confusion matrix cpu/gpu device error
+* Better handling of callbacks where apply_on_states=None (the callback is applied on all states)
+* Updated Pipfile
 
 
 Previously on lpd: 
diff --git a/examples/multiple_inputs/model.py b/examples/multiple_inputs/model.py
index 1183d8c..4359686 100644
--- a/examples/multiple_inputs/model.py
+++ b/examples/multiple_inputs/model.py
@@ -1,4 +1,4 @@
-import torch as T
+import torch
 import torch.nn as nn
 import torch.optim as optim
 
@@ -30,11 +30,11 @@ def __init__(self, config, num_embeddings):
                                                             drop_out_proba=config.TRANSFORMER_DROP_OUT_PROBA,
                                                             ff_expansion_rate=config.TRANSFORMER_FF_EXPANSION_RATE)
 
-        self.external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True)
+        self.external_query_attention = Attention()
         self.norm = nn.LayerNorm(normalized_shape=config.EMBEDDINGS_SIZE) # WILL APPLY NORM OVER THE LAST DIMENTION ONLY
         self.mat_mul2d = MatMul2D(transpose_b=True)
 
-    def forward(self, x1, x2, x3):
+    def forward(self, x1, x2, x3, index_select_aux):
         # x1   : sequence-Input  	(batch, num_elements)
         # x2   : some1-Input        (batch, 1)
         # x3   : some2-Input        (batch, 1)
@@ -43,14 +43,17 @@ def forward(self, x1, x2, x3):
         x1_emb_transformed = self.transformer_encoder(x1_emb)                                 # (batch, num_elements, emb_size)
         
         x3_emb = self.embedding_layer(x3)                                                     # (batch, emb_size)
-        x3_emb_unsqueesed = x3_emb.unsqueeze(1)                                               # (batch, 1, emb_size)
+        x3_emb_unsqueeze = x3_emb.unsqueeze(1)                                               # (batch, 1, emb_size)
 
-        x1_with_x3_reduced = self.external_query_attention(q=x3_emb_unsqueesed, 
-                                                           k=x1_emb_transformed, 
-                                                           v=x1_emb_transformed)              # (batch, 1, emb_size)
+        x1_with_x3_reduced = torch.cat([x3_emb_unsqueeze, x1_emb_transformed], dim=1)          # (batch, num_elements+1, emb_size)
+
+        x1_with_x3_reduced = self.external_query_attention(q=x1_with_x3_reduced, 
+                                                           k=x1_with_x3_reduced, 
+                                                           v=x1_with_x3_reduced)              # (batch, num_elements+1, emb_size)
         
+        x1_with_x3_reduced = torch.index_select(x1_with_x3_reduced, dim=1, index=index_select_aux) # (batch, 1, emb_size)
 
-        x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueesed)     		  # (batch, 1, emb_size)
+        x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueeze)     		  # (batch, 1, emb_size)
 
         x2_emb = self.embedding_layer(x2)                                                     # (batch, emb_size)
 
diff --git a/examples/multiple_inputs/train.py b/examples/multiple_inputs/train.py
index 335b475..cf89980 100644
--- a/examples/multiple_inputs/train.py
+++ b/examples/multiple_inputs/train.py
@@ -1,5 +1,5 @@
 import random
-import torch as T
+import torch
 import os 
 from .config import Config
 from .model import get_trainer
@@ -12,7 +12,8 @@ def prepare_chunk_to_model_input(config, chunk):
     x2 = [c[config.IDX_OF_X2] for c in chunk]
     x3 = [c[config.IDX_OF_X3] for c in chunk]
     y = [c[config.IDX_OF_LABEL] for c in chunk]
-    return [T.LongTensor(x1), T.LongTensor(x2), T.LongTensor(x3)], T.Tensor(y)
+    index_select_aux = torch.LongTensor([0])
+    return [torch.LongTensor(x1), torch.LongTensor(x2), torch.LongTensor(x3), index_select_aux], torch.Tensor(y)
 
 def get_data_stats(data_generator, verbose=1):
     sanity_count = int(1e6)
diff --git a/lpd/callbacks/callback_base.py b/lpd/callbacks/callback_base.py
index 0783536..5fba3d1 100644
--- a/lpd/callbacks/callback_base.py
+++ b/lpd/callbacks/callback_base.py
@@ -80,7 +80,8 @@ def _extract_apply_on_states(self, apply_on_states):
                     raise ValueError(f'[CallbackBase] - {s} is of type {type(s)}, expected type {State}')
             return result
         elif apply_on_states is None:
-            result.add(apply_on_states)
+            for state in State:
+                result.add(state) 
             return result
 
         raise ValueError(f'[CallbackBase] - got bad value for apply_on_states')
@@ -88,8 +89,8 @@ def _extract_apply_on_states(self, apply_on_states):
     def _validations(self):
         if self.apply_on_phase is None:
             raise ValueError('[CallbackBase] - No callback phase was provided')
-        if None in self.apply_on_states:
-            print('[CallbackBase][!] - apply_on_states is None, callback will be applied to all states')
+        if self.apply_on_states is None:
+            print('[CallbackBase] - apply_on_states is None, callback will be applied to all states')
 
         valid_pairs = {
                         Phase.TRAIN_BEGIN:{None, State.EXTERNAL}, 
@@ -130,11 +131,4 @@ def should_apply_on_phase(self, callback_context: CallbackContext):
         raise ValueError('[CallbackBase] - got bad value for apply_on_phase')
 
     def should_apply_on_state(self, callback_context: CallbackContext):
-        if None in self.apply_on_states:
-            return True
-
-        for state in self.apply_on_states:
-            if callback_context.trainer_state == state:
-                return True
-
-        return False
\ No newline at end of file
+        return callback_context.trainer_state in self.apply_on_states
\ No newline at end of file
diff --git a/lpd/extensions/custom_layers.py b/lpd/extensions/custom_layers.py
index 9efef39..d0ac4d6 100644
--- a/lpd/extensions/custom_layers.py
+++ b/lpd/extensions/custom_layers.py
@@ -41,63 +41,53 @@ def forward(self, inputs):
 class Attention(nn.Module):
     """
         The architecture is based on the paper “Attention Is All You Need”
-        Usage (1)
-        It can be used as Attention in transformer if q,k,v share the same dimensions.
-        
-        Usage (2)
-        It can also be used as a method to aggregate a group of vectors into 1 vector if q dimensions are (batch, 1, key_dim)
-        that way, instead of using Sum, or Average, you can have a learnable query vector (or a few of them) that will learn the aggregation function.
-        See example in lpd.examples.multiple_inputs.model, where we define external_query_attention like so:
-        external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True)  
+        Used as the Attention layer in transformer.
 
         Args:
         key_dim - as defined in the paper, the number of expected features in the encoder inputs
-        use_query_dense - whether to pass q input into another Dense layer, mostly used in Usage (2), to
-                          run q into a transformation that will transform it into the vector space of k and v
         name - optional, any string to describe this layer
     """
-    def __init__(self, key_dim, use_query_dense=False, name=None):
+    def __init__(self, name=None):
         super(Attention, self).__init__()
         #PARAMS
-        self.key_dim            = key_dim
-        self.sqrt_key_dim       = key_dim ** 0.5
-        self.use_query_dense    = use_query_dense
         self.name               = name if name else 'attention'
         #LAYERS
         self.mat_mul2d          = MatMul2D(transpose_b=False, name = f'{self.name}__MatMul2D')
         self.mat_mul2d_t        = MatMul2D(transpose_b=True, name = f'{self.name}__MatMul2DT')
         self.softmax_last_dim   = nn.Softmax(dim=-1)
-        if self.use_query_dense:
-            # SOMETIMES WE WANT TO GO THROUGH ANOTHER TRANSFORMATION BEFORE RUNNING THE QUERY,
-            # FOR EXAMPLE, WHEN THIS IS USED AS A STANDALONE LAYER
-            self.query_dense    = Dense(in_dim=self.key_dim, out_dim=self.key_dim, use_bias=False, activation=None, name = f'{self.name}__Dense')
 
     def forward(self, q,k,v, mask = None):
-        # q:    (batch, ?, key_dim)             where "?" can be 1 or seq_len
-        # k:    (batch, seq_len, key_dim)
-        # v:    (batch, seq_len, key_dim)
-        # mask: (batch, 1, seq_len)
+        # q:    (batch, seq_len, emb_dim)
+        # k:    (batch, seq_len, emb_dim)
+        # v:    (batch, seq_len, emb_dim)
+        # mask: (batch, seq_len)
 
         # APPLY ATTENTION:
         #                       (     Q * Kt     )
         #               softmax (   ----------   ) * V
         #                       (    sqrt(dk)    )
 
-        if self.use_query_dense:
-            q = self.query_dense(q)                                            # (batch, seq_len, key_dim)
+        if mask is not None:
+            assert q.shape == k.shape == v.shape, 'Dimensions mismatch, When using mask it is expected that the shape of q,k,v will be identical'
 
-        q_k = self.mat_mul2d_t(q, k)                                           # (batch, ?, seq_len)
-        scores = q_k / self.sqrt_key_dim                                       # (batch, ?, seq_len)
+        emb_dim = q.shape[-1]
+        q = q / (emb_dim ** 0.5)                                                # (batch, seq_len, emb_dim)
+        q_k = self.mat_mul2d_t(q, k)                                            # (batch, seq_len, seq_len)
 
         if mask is not None:
-            mask_ready = torch.log(mask)                                       # (batch, 1, seq_len)
-            scores = scores + mask_ready                                       # (batch, ?, seq_len) (+= is doing broadcasting)
+            # PREPARE MASK FOR SOFTMAX ON COLUMNS, WILL ZERO OUT MASKED COLUMNS
+            mask_ready = torch.log(mask).unsqueeze(-2)                          # (batch, 1, seq_len)
+            q_k = q_k + mask_ready                                              # (batch, seq_len, seq_len)  (broadcasting op)
+
+        attention_weights = self.softmax_last_dim(q_k)                          # (batch, seq_len, seq_len)
 
-        attention_weights = self.softmax_last_dim(scores)                      # (batch, ?, seq_len)
-        
-        attention_output = self.mat_mul2d(attention_weights, v)                # (batch, ?, key_dim)
+        attention_output = self.mat_mul2d(attention_weights, v)                 # (batch, seq_len, emb_dim)
 
-        return attention_output                                                # (batch, ?, key_dim)
+        if mask is not None:
+            # A CLEAN UP THAT WILL RESTORE MASKED ROWS TO THEIR QUERY VALUES (NOTE: q WAS ALREADY SCALED BY 1/sqrt(emb_dim) ABOVE)
+            attention_output = (attention_output * mask.unsqueeze(-1)) + (q * (1-mask).unsqueeze(-1)) # (batch, seq_len, emb_dim)
+
+        return attention_output                                                # (batch, seq_len, emb_dim)
 
 class AttentionHead(nn.Module):
     def __init__(self, in_dim, key_dim, name=None):
@@ -112,7 +102,7 @@ def __init__(self, in_dim, key_dim, name=None):
         self.query_dense  = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__Q-Dense')
         self.key_dense    = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__K-Dense')
         self.value_dense  = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__V-Dense')
-        self.att          = Attention(self.key_dim, name = f'{self.name}__Attention')
+        self.att          = Attention(name = f'{self.name}__Attention')
 
     def forward(self, inputs, mask = None):     # inputs:(batch, seq_len, emb_size), mask:(batch, seq_len)
         q = self.query_dense(inputs)            # (batch, seq_len, key_dim)
@@ -282,4 +272,3 @@ def forward(self, inputs, mask=None):
         for encoder_layer in self.transformer_blocks:
             outputs = encoder_layer(inputs=outputs, mask=mask)
         return outputs                                # (batch, seq_len, out_dim)   <-- USUALLY out_dim = emb_size
-
diff --git a/lpd/metrics/confusion_matrix.py b/lpd/metrics/confusion_matrix.py
index 4b938de..51f7f51 100644
--- a/lpd/metrics/confusion_matrix.py
+++ b/lpd/metrics/confusion_matrix.py
@@ -148,4 +148,5 @@ def update_state(self, y_pred: T.Tensor, y_true: T.Tensor):
         y_true_class_idxs = y_true.long()
 
         for row, col in zip(y_pred_class_idxs, y_true_class_idxs):
-            self.confusion[row][col] += 1
+            self.confusion[row.cpu()][col.cpu()] += 1
+
diff --git a/setup-nodeps.py b/setup-nodeps.py
index 21185a6..262f020 100644
--- a/setup-nodeps.py
+++ b/setup-nodeps.py
@@ -27,16 +27,14 @@
         'License :: OSI Approved :: MIT License',
         'Operating System :: OS Independent',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Utilities'
     ]
 
 setup(
     name='lpd-nodeps',
-    version='0.4.8',
+    version='0.4.9',
     description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch',
     long_description_content_type='text/markdown',
     long_description=README_md,
@@ -47,7 +45,7 @@
     maintainer_email='torch.lpd@gmail.com',
     packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']),
     install_requires=install_requires_nodeps,
-    python_requires='>=3.6',
+    python_requires='>=3.9',
     classifiers=classifiers,
     keywords=['lpd-nodeps']
 )
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d21f203..d9fe940 100644
--- a/setup.py
+++ b/setup.py
@@ -29,16 +29,14 @@
         'License :: OSI Approved :: MIT License',
         'Operating System :: OS Independent',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Utilities'
     ]
 
 setup(
     name='lpd',
-    version='0.4.8',
+    version='0.4.9',
     description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch',
     long_description_content_type='text/markdown',
     long_description=README_md,
@@ -49,7 +47,7 @@
     maintainer_email='torch.lpd@gmail.com',
     packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']),
     install_requires=install_requires,
-    python_requires='>=3.6',
+    python_requires='>=3.9',
     classifiers=classifiers,
     keywords=['pytorch,trainer,callback,callbacks,earlystopping,tensorboard,modelcheckpoint,checkpoint,layers,dense,metrics,predictor,binary accuracy,extensions,track,monitor,machine,deep learning,neural,networks,AI,keras decay,confusion matrix']
 )
\ No newline at end of file