diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 5565a08..e2b831e 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,6 +1,13 @@
Change Log
==========
+0.4.9 (22/01/2023)
+-----------------
+* Added an assertion to the Attention class (from extensions) that q, k, v have identical shapes when a mask is used
+* Fixed confusion matrix cpu/gpu device error
+* Better handling of callbacks where apply_on_states=None (apply on all states)
+* Updated Pipfile
+
0.4.8 (15/09/2022)
-----------------
diff --git a/Pipfile b/Pipfile
index 8293cfd..b94e60b 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,8 +4,8 @@ url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
-tensorboard = "==2.3.0"
-tqdm = "==4.51.0"
+tensorboard = "*"
+tqdm = "*"
[packages]
numpy = "*"
@@ -14,4 +14,4 @@ torchvision = "*"
protobuf = "==3.20.*"
[requires]
-python_version = "3.7.6"
+python_version = "3.9.1"
diff --git a/README.md b/README.md
index b878bb4..a882beb 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,11 @@ There are 2 types of ``lpd`` packagaes available
pip install lpd-nodeps
```
-[v0.4.8-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following:
-* Added AbsoluteThresholdChecker & RelativeThresholdChecker classes
-* ThresholdCheckers can now be used in CallbackMonitor to better define metric tracking
+[v0.4.9-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following:
+* Added an assertion to the Attention class (from extensions) that q, k, v have identical shapes when a mask is used
+* Fixed confusion matrix cpu/gpu device error
+* Better handling of callbacks where apply_on_states=None (apply on all states)
+* Updated Pipfile
Previously on lpd:
diff --git a/examples/multiple_inputs/model.py b/examples/multiple_inputs/model.py
index 1183d8c..4359686 100644
--- a/examples/multiple_inputs/model.py
+++ b/examples/multiple_inputs/model.py
@@ -1,4 +1,4 @@
-import torch as T
+import torch
import torch.nn as nn
import torch.optim as optim
@@ -30,11 +30,11 @@ def __init__(self, config, num_embeddings):
drop_out_proba=config.TRANSFORMER_DROP_OUT_PROBA,
ff_expansion_rate=config.TRANSFORMER_FF_EXPANSION_RATE)
- self.external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True)
+ self.external_query_attention = Attention()
self.norm = nn.LayerNorm(normalized_shape=config.EMBEDDINGS_SIZE) # WILL APPLY NORM OVER THE LAST DIMENTION ONLY
self.mat_mul2d = MatMul2D(transpose_b=True)
- def forward(self, x1, x2, x3):
+ def forward(self, x1, x2, x3, index_select_aux):
# x1 : sequence-Input (batch, num_elements)
# x2 : some1-Input (batch, 1)
# x3 : some2-Input (batch, 1)
@@ -43,14 +43,17 @@ def forward(self, x1, x2, x3):
x1_emb_transformed = self.transformer_encoder(x1_emb) # (batch, num_elements, emb_size)
x3_emb = self.embedding_layer(x3) # (batch, emb_size)
- x3_emb_unsqueesed = x3_emb.unsqueeze(1) # (batch, 1, emb_size)
+ x3_emb_unsqueeze = x3_emb.unsqueeze(1) # (batch, 1, emb_size)
- x1_with_x3_reduced = self.external_query_attention(q=x3_emb_unsqueesed,
- k=x1_emb_transformed,
- v=x1_emb_transformed) # (batch, 1, emb_size)
+ x1_with_x3_reduced = torch.cat([x3_emb_unsqueeze, x1_emb_transformed], dim=1) # (batch, num_elements+1, emb_size)
+
+ x1_with_x3_reduced = self.external_query_attention(q=x1_with_x3_reduced,
+ k=x1_with_x3_reduced,
+ v=x1_with_x3_reduced) # (batch, num_elements+1, emb_size)
+ x1_with_x3_reduced = torch.index_select(x1_with_x3_reduced, dim=1, index=index_select_aux) # (batch, 1, emb_size)
- x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueesed) # (batch, 1, emb_size)
+ x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueeze) # (batch, 1, emb_size)
x2_emb = self.embedding_layer(x2) # (batch, emb_size)
diff --git a/examples/multiple_inputs/train.py b/examples/multiple_inputs/train.py
index 335b475..cf89980 100644
--- a/examples/multiple_inputs/train.py
+++ b/examples/multiple_inputs/train.py
@@ -1,5 +1,5 @@
import random
-import torch as T
+import torch
import os
from .config import Config
from .model import get_trainer
@@ -12,7 +12,8 @@ def prepare_chunk_to_model_input(config, chunk):
x2 = [c[config.IDX_OF_X2] for c in chunk]
x3 = [c[config.IDX_OF_X3] for c in chunk]
y = [c[config.IDX_OF_LABEL] for c in chunk]
- return [T.LongTensor(x1), T.LongTensor(x2), T.LongTensor(x3)], T.Tensor(y)
+ index_select_aux = torch.LongTensor([0])
+ return [torch.LongTensor(x1), torch.LongTensor(x2), torch.LongTensor(x3), index_select_aux], torch.Tensor(y)
def get_data_stats(data_generator, verbose=1):
sanity_count = int(1e6)
diff --git a/lpd/callbacks/callback_base.py b/lpd/callbacks/callback_base.py
index 0783536..5fba3d1 100644
--- a/lpd/callbacks/callback_base.py
+++ b/lpd/callbacks/callback_base.py
@@ -80,7 +80,8 @@ def _extract_apply_on_states(self, apply_on_states):
raise ValueError(f'[CallbackBase] - {s} is of type {type(s)}, expected type {State}')
return result
elif apply_on_states is None:
- result.add(apply_on_states)
+ for state in State:
+ result.add(state)
return result
raise ValueError(f'[CallbackBase] - got bad value for apply_on_states')
@@ -88,8 +89,8 @@ def _extract_apply_on_states(self, apply_on_states):
def _validations(self):
if self.apply_on_phase is None:
raise ValueError('[CallbackBase] - No callback phase was provided')
- if None in self.apply_on_states:
- print('[CallbackBase][!] - apply_on_states is None, callback will be applied to all states')
+ if self.apply_on_states is None:
+ print('[CallbackBase] - apply_on_states is None, callback will be applied to all states')
valid_pairs = {
Phase.TRAIN_BEGIN:{None, State.EXTERNAL},
@@ -130,11 +131,4 @@ def should_apply_on_phase(self, callback_context: CallbackContext):
raise ValueError('[CallbackBase] - got bad value for apply_on_phase')
def should_apply_on_state(self, callback_context: CallbackContext):
- if None in self.apply_on_states:
- return True
-
- for state in self.apply_on_states:
- if callback_context.trainer_state == state:
- return True
-
- return False
\ No newline at end of file
+ return callback_context.trainer_state in self.apply_on_states
\ No newline at end of file
diff --git a/lpd/extensions/custom_layers.py b/lpd/extensions/custom_layers.py
index 9efef39..d0ac4d6 100644
--- a/lpd/extensions/custom_layers.py
+++ b/lpd/extensions/custom_layers.py
@@ -41,63 +41,53 @@ def forward(self, inputs):
class Attention(nn.Module):
"""
The architecture is based on the paper “Attention Is All You Need”
- Usage (1)
- It can be used as Attention in transformer if q,k,v share the same dimensions.
-
- Usage (2)
- It can also be used as a method to aggregate a group of vectors into 1 vector if q dimensions are (batch, 1, key_dim)
- that way, instead of using Sum, or Average, you can have a learnable query vector (or a few of them) that will learn the aggregation function.
- See example in lpd.examples.multiple_inputs.model, where we define external_query_attention like so:
- external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True)
+ Used as the Attention layer in transformer.
Args:
key_dim - as defined in the paper, the number of expected features in the encoder inputs
- use_query_dense - whether to pass q input into another Dense layer, mostly used in Usage (2), to
- run q into a transformation that will transform it into the vector space of k and v
name - optional, any string to describe this layer
"""
- def __init__(self, key_dim, use_query_dense=False, name=None):
+ def __init__(self, name=None):
super(Attention, self).__init__()
#PARAMS
- self.key_dim = key_dim
- self.sqrt_key_dim = key_dim ** 0.5
- self.use_query_dense = use_query_dense
self.name = name if name else 'attention'
#LAYERS
self.mat_mul2d = MatMul2D(transpose_b=False, name = f'{self.name}__MatMul2D')
self.mat_mul2d_t = MatMul2D(transpose_b=True, name = f'{self.name}__MatMul2DT')
self.softmax_last_dim = nn.Softmax(dim=-1)
- if self.use_query_dense:
- # SOMETIMES WE WANT TO GO THROUGH ANOTHER TRANSFORMATION BEFORE RUNNING THE QUERY,
- # FOR EXAMPLE, WHEN THIS IS USED AS A STANDALONE LAYER
- self.query_dense = Dense(in_dim=self.key_dim, out_dim=self.key_dim, use_bias=False, activation=None, name = f'{self.name}__Dense')
def forward(self, q,k,v, mask = None):
- # q: (batch, ?, key_dim) where "?" can be 1 or seq_len
- # k: (batch, seq_len, key_dim)
- # v: (batch, seq_len, key_dim)
- # mask: (batch, 1, seq_len)
+ # q: (batch, seq_len, emb_dim)
+ # k: (batch, seq_len, emb_dim)
+ # v: (batch, seq_len, emb_dim)
+ # mask: (batch, seq_len)
# APPLY ATTENTION:
# ( Q * Kt )
# softmax ( ---------- ) * V
# ( sqrt(dk) )
- if self.use_query_dense:
- q = self.query_dense(q) # (batch, seq_len, key_dim)
+ if mask is not None:
+ assert q.shape == k.shape == v.shape, 'Dimensions mismatch, When using mask it is expected that the shape of q,k,v will be identical'
- q_k = self.mat_mul2d_t(q, k) # (batch, ?, seq_len)
- scores = q_k / self.sqrt_key_dim # (batch, ?, seq_len)
+ emb_dim = q.shape[-1]
+ q = q / (emb_dim ** 0.5) # (batch, seq_len, emb_dim)
+ q_k = self.mat_mul2d_t(q, k) # (batch, seq_len, seq_len)
if mask is not None:
- mask_ready = torch.log(mask) # (batch, 1, seq_len)
- scores = scores + mask_ready # (batch, ?, seq_len) (+= is doing broadcasting)
+ # PREPARE MASK FOR SOFTMAX ON COLUMNS, WILL ZERO OUT MASKED COLUMNS
+ mask_ready = torch.log(mask).unsqueeze(-2) # (batch, 1, seq_len)
+ q_k = q_k + mask_ready # (batch, seq_len, seq_len) (broadcasting op)
+
+ attention_weights = self.softmax_last_dim(q_k) # (batch, seq_len, seq_len)
- attention_weights = self.softmax_last_dim(scores) # (batch, ?, seq_len)
-
- attention_output = self.mat_mul2d(attention_weights, v) # (batch, ?, key_dim)
+ attention_output = self.mat_mul2d(attention_weights, v) # (batch, seq_len, emb_dim)
- return attention_output # (batch, ?, key_dim)
+ if mask is not None:
+ # A CLEAN UP THAT WILL RESTORE MASKED ROWS TO THEIR ORIGINAL VALUES
+ attention_output = (attention_output * mask.unsqueeze(-1)) + (q * (1-mask).unsqueeze(-1)) # (batch, seq_len, emb_dim)
+
+ return attention_output # (batch, seq_len, emb_dim)
class AttentionHead(nn.Module):
def __init__(self, in_dim, key_dim, name=None):
@@ -112,7 +102,7 @@ def __init__(self, in_dim, key_dim, name=None):
self.query_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__Q-Dense')
self.key_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__K-Dense')
self.value_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__V-Dense')
- self.att = Attention(self.key_dim, name = f'{self.name}__Attention')
+ self.att = Attention(name = f'{self.name}__Attention')
def forward(self, inputs, mask = None): # inputs:(batch, seq_len, emb_size), mask:(batch, seq_len)
q = self.query_dense(inputs) # (batch, seq_len, key_dim)
@@ -282,4 +272,3 @@ def forward(self, inputs, mask=None):
for encoder_layer in self.transformer_blocks:
outputs = encoder_layer(inputs=outputs, mask=mask)
return outputs # (batch, seq_len, out_dim) <-- USUALLY out_dim = emb_size
-
diff --git a/lpd/metrics/confusion_matrix.py b/lpd/metrics/confusion_matrix.py
index 4b938de..51f7f51 100644
--- a/lpd/metrics/confusion_matrix.py
+++ b/lpd/metrics/confusion_matrix.py
@@ -148,4 +148,5 @@ def update_state(self, y_pred: T.Tensor, y_true: T.Tensor):
y_true_class_idxs = y_true.long()
for row, col in zip(y_pred_class_idxs, y_true_class_idxs):
- self.confusion[row][col] += 1
+ self.confusion[row.cpu()][col.cpu()] += 1
+
diff --git a/setup-nodeps.py b/setup-nodeps.py
index 21185a6..262f020 100644
--- a/setup-nodeps.py
+++ b/setup-nodeps.py
@@ -27,16 +27,14 @@
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities'
]
setup(
name='lpd-nodeps',
- version='0.4.8',
+ version='0.4.9',
description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch',
long_description_content_type='text/markdown',
long_description=README_md,
@@ -47,7 +45,7 @@
maintainer_email='torch.lpd@gmail.com',
packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']),
install_requires=install_requires_nodeps,
- python_requires='>=3.6',
+ python_requires='>=3.9',
classifiers=classifiers,
keywords=['lpd-nodeps']
)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d21f203..d9fe940 100644
--- a/setup.py
+++ b/setup.py
@@ -29,16 +29,14 @@
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Utilities'
]
setup(
name='lpd',
- version='0.4.8',
+ version='0.4.9',
description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch',
long_description_content_type='text/markdown',
long_description=README_md,
@@ -49,7 +47,7 @@
maintainer_email='torch.lpd@gmail.com',
packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']),
install_requires=install_requires,
- python_requires='>=3.6',
+ python_requires='>=3.9',
classifiers=classifiers,
keywords=['pytorch,trainer,callback,callbacks,earlystopping,tensorboard,modelcheckpoint,checkpoint,layers,dense,metrics,predictor,binary accuracy,extensions,track,monitor,machine,deep learning,neural,networks,AI,keras decay,confusion matrix']
)
\ No newline at end of file