diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 5565a08..e2b831e 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,6 +1,13 @@ Change Log ========== +0.4.9 (22/01/2023) +----------------- +* Added assert to Attention class (from extensions) when mask is used +* Fixed confusion matrix cpu/gpu device error +* Better handling on callbacks where apply_on_states=None (apply on all states) +* Updated Pipfile + 0.4.8 (15/09/2022) ----------------- diff --git a/Pipfile b/Pipfile index 8293cfd..b94e60b 100644 --- a/Pipfile +++ b/Pipfile @@ -4,8 +4,8 @@ url = "https://pypi.org/simple" verify_ssl = true [dev-packages] -tensorboard = "==2.3.0" -tqdm = "==4.51.0" +tensorboard = "*" +tqdm = "*" [packages] numpy = "*" @@ -14,4 +14,4 @@ torchvision = "*" protobuf = "==3.20.*" [requires] -python_version = "3.7.6" +python_version = "3.9.1" diff --git a/README.md b/README.md index b878bb4..a882beb 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,11 @@ There are 2 types of ``lpd`` packagaes available pip install lpd-nodeps ``` -[v0.4.8-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following: -* Added AbsoluteThresholdChecker & RelativeThresholdChecker classes -* ThresholdCheckers can now be used in CallbackMonitor to better define metric tracking +[v0.4.9-beta](https://github.com/RoySadaka/lpd/releases) Release - contains the following: +* Added assert to Attention class (from extensions) when mask is used +* Fixed confusion matrix cpu/gpu device error +* Better handling on callbacks where apply_on_states=None (apply on all states) +* Updated Pipfile Previously on lpd: diff --git a/examples/multiple_inputs/model.py b/examples/multiple_inputs/model.py index 1183d8c..4359686 100644 --- a/examples/multiple_inputs/model.py +++ b/examples/multiple_inputs/model.py @@ -1,4 +1,4 @@ -import torch as T +import torch import torch.nn as nn import torch.optim as optim @@ -30,11 +30,11 @@ def __init__(self, config, num_embeddings): drop_out_proba=config.TRANSFORMER_DROP_OUT_PROBA, ff_expansion_rate=config.TRANSFORMER_FF_EXPANSION_RATE) - self.external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True) + self.external_query_attention = Attention() self.norm = nn.LayerNorm(normalized_shape=config.EMBEDDINGS_SIZE) # WILL APPLY NORM OVER THE LAST DIMENTION ONLY self.mat_mul2d = MatMul2D(transpose_b=True) - def forward(self, x1, x2, x3): + def forward(self, x1, x2, x3, index_select_aux): # x1 : sequence-Input (batch, num_elements) # x2 : some1-Input (batch, 1) # x3 : some2-Input (batch, 1) @@ -43,14 +43,17 @@ def forward(self, x1, x2, x3): x1_emb_transformed = self.transformer_encoder(x1_emb) # (batch, num_elements, emb_size) x3_emb = self.embedding_layer(x3) # (batch, emb_size) - x3_emb_unsqueesed = x3_emb.unsqueeze(1) # (batch, 1, emb_size) + x3_emb_unsqueeze = x3_emb.unsqueeze(1) # (batch, 1, emb_size) - x1_with_x3_reduced = self.external_query_attention(q=x3_emb_unsqueesed, - k=x1_emb_transformed, - v=x1_emb_transformed) # (batch, 1, emb_size) + x1_with_x3_reduced = torch.cat([x3_emb_unsqueeze, x1_emb_transformed], dim=1) # (batch, num_elements+1, emb_size) + + x1_with_x3_reduced = self.external_query_attention(q=x1_with_x3_reduced, + k=x1_with_x3_reduced, + v=x1_with_x3_reduced) # (batch, num_elements+1, emb_size) + x1_with_x3_reduced = torch.index_select(x1_with_x3_reduced, dim=1, index=index_select_aux) # (batch, 1, emb_size) - x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueesed) # (batch, 1, emb_size) + x1_with_x3_residual = self.norm(x1_with_x3_reduced + x3_emb_unsqueeze) # (batch, 1, emb_size) x2_emb = self.embedding_layer(x2) # (batch, emb_size) diff --git a/examples/multiple_inputs/train.py b/examples/multiple_inputs/train.py index 335b475..cf89980 100644 --- a/examples/multiple_inputs/train.py +++ b/examples/multiple_inputs/train.py @@ -1,5 +1,5 @@ import random -import torch as T +import torch import os from .config import Config from .model import get_trainer @@ -12,7 +12,8 @@ def prepare_chunk_to_model_input(config, chunk): x2 = [c[config.IDX_OF_X2] for c in chunk] x3 = [c[config.IDX_OF_X3] for c in chunk] y = [c[config.IDX_OF_LABEL] for c in chunk] - return [T.LongTensor(x1), T.LongTensor(x2), T.LongTensor(x3)], T.Tensor(y) + index_select_aux = torch.LongTensor([0]) + return [torch.LongTensor(x1), torch.LongTensor(x2), torch.LongTensor(x3), index_select_aux], torch.Tensor(y) def get_data_stats(data_generator, verbose=1): sanity_count = int(1e6) diff --git a/lpd/callbacks/callback_base.py b/lpd/callbacks/callback_base.py index 0783536..5fba3d1 100644 --- a/lpd/callbacks/callback_base.py +++ b/lpd/callbacks/callback_base.py @@ -80,7 +80,8 @@ def _extract_apply_on_states(self, apply_on_states): raise ValueError(f'[CallbackBase] - {s} is of type {type(s)}, expected type {State}') return result elif apply_on_states is None: - result.add(apply_on_states) + for state in State: + result.add(state) return result raise ValueError(f'[CallbackBase] - got bad value for apply_on_states') @@ -88,8 +89,8 @@ def _extract_apply_on_states(self, apply_on_states): def _validations(self): if self.apply_on_phase is None: raise ValueError('[CallbackBase] - No callback phase was provided') - if None in self.apply_on_states: - print('[CallbackBase][!] - apply_on_states is None, callback will be applied to all states') + if self.apply_on_states is None: + print('[CallbackBase] - apply_on_states is None, callback will be applied to all states') valid_pairs = { Phase.TRAIN_BEGIN:{None, State.EXTERNAL}, @@ -130,11 +131,4 @@ def should_apply_on_phase(self, callback_context: CallbackContext): raise ValueError('[CallbackBase] - got bad value for apply_on_phase') def should_apply_on_state(self, callback_context: CallbackContext): - if None in self.apply_on_states: - return True - - for state in self.apply_on_states: - if callback_context.trainer_state == state: - return True - - return False \ No newline at end of file + return callback_context.trainer_state in self.apply_on_states \ No newline at end of file diff --git a/lpd/extensions/custom_layers.py b/lpd/extensions/custom_layers.py index 9efef39..d0ac4d6 100644 --- a/lpd/extensions/custom_layers.py +++ b/lpd/extensions/custom_layers.py @@ -41,63 +41,53 @@ def forward(self, inputs): class Attention(nn.Module): """ The architecture is based on the paper “Attention Is All You Need” - Usage (1) - It can be used as Attention in transformer if q,k,v share the same dimensions. - - Usage (2) - It can also be used as a method to aggregate a group of vectors into 1 vector if q dimensions are (batch, 1, key_dim) - that way, instead of using Sum, or Average, you can have a learnable query vector (or a few of them) that will learn the aggregation function. - See example in lpd.examples.multiple_inputs.model, where we define external_query_attention like so: - external_query_attention = Attention(key_dim=config.EMBEDDINGS_SIZE, use_query_dense=True) + Used as the Attention layer in transformer. Args: key_dim - as defined in the paper, the number of expected features in the encoder inputs - use_query_dense - whether to pass q input into another Dense layer, mostly used in Usage (2), to - run q into a transformation that will transform it into the vector space of k and v name - optional, any string to describe this layer """ - def __init__(self, key_dim, use_query_dense=False, name=None): + def __init__(self, name=None): super(Attention, self).__init__() #PARAMS - self.key_dim = key_dim - self.sqrt_key_dim = key_dim ** 0.5 - self.use_query_dense = use_query_dense self.name = name if name else 'attention' #LAYERS self.mat_mul2d = MatMul2D(transpose_b=False, name = f'{self.name}__MatMul2D') self.mat_mul2d_t = MatMul2D(transpose_b=True, name = f'{self.name}__MatMul2DT') self.softmax_last_dim = nn.Softmax(dim=-1) - if self.use_query_dense: - # SOMETIMES WE WANT TO GO THROUGH ANOTHER TRANSFORMATION BEFORE RUNNING THE QUERY, - # FOR EXAMPLE, WHEN THIS IS USED AS A STANDALONE LAYER - self.query_dense = Dense(in_dim=self.key_dim, out_dim=self.key_dim, use_bias=False, activation=None, name = f'{self.name}__Dense') def forward(self, q,k,v, mask = None): - # q: (batch, ?, key_dim) where "?" can be 1 or seq_len - # k: (batch, seq_len, key_dim) - # v: (batch, seq_len, key_dim) - # mask: (batch, 1, seq_len) + # q: (batch, seq_len, emb_dim) + # k: (batch, seq_len, emb_dim) + # v: (batch, seq_len, emb_dim) + # mask: (batch, seq_len) # APPLY ATTENTION: # ( Q * Kt ) # softmax ( ---------- ) * V # ( sqrt(dk) ) - if self.use_query_dense: - q = self.query_dense(q) # (batch, seq_len, key_dim) + if mask is not None: + assert q.shape == k.shape == v.shape, 'Dimensions mismatch, When using mask it is expected that the shape of q,k,v will be identical' - q_k = self.mat_mul2d_t(q, k) # (batch, ?, seq_len) - scores = q_k / self.sqrt_key_dim # (batch, ?, seq_len) + emb_dim = q.shape[-1] + q = q / (emb_dim ** 0.5) # (batch, seq_len, emb_dim) + q_k = self.mat_mul2d_t(q, k) # (batch, seq_len, seq_len) if mask is not None: - mask_ready = torch.log(mask) # (batch, 1, seq_len) - scores = scores + mask_ready # (batch, ?, seq_len) (+= is doing broadcasting) + # PREPARE MASK FOR SOFTMAX ON COLUMNS, WILL ZERO OUT MASKED COLUMNS + mask_ready = torch.log(mask).unsqueeze(-2) # (batch, 1, seq_len) + q_k = q_k + mask_ready # (batch, seq_len, seq_len) (broadcasting op) + + attention_weights = self.softmax_last_dim(q_k) # (batch, seq_len, seq_len) - attention_weights = self.softmax_last_dim(scores) # (batch, ?, seq_len) - - attention_output = self.mat_mul2d(attention_weights, v) # (batch, ?, key_dim) + attention_output = self.mat_mul2d(attention_weights, v) # (batch, seq_len, emb_dim) - return attention_output # (batch, ?, key_dim) + if mask is not None: + # A CLEAN UP THAT WILL RESTORE MASKED ROWS TO THEIR ORIGINAL VALUES + attention_output = (attention_output * mask.unsqueeze(-1)) + (q * (1-mask).unsqueeze(-1)) # (batch, seq_len, emb_dim) + + return attention_output # (batch, seq_len, emb_dim) class AttentionHead(nn.Module): def __init__(self, in_dim, key_dim, name=None): @@ -112,7 +102,7 @@ def __init__(self, in_dim, key_dim, name=None): self.query_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__Q-Dense') self.key_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__K-Dense') self.value_dense = Dense(self.in_dim, self.key_dim, use_bias=True, activation=None, name = f'{self.name}__V-Dense') - self.att = Attention(self.key_dim, name = f'{self.name}__Attention') + self.att = Attention(name = f'{self.name}__Attention') def forward(self, inputs, mask = None): # inputs:(batch, seq_len, emb_size), mask:(batch, seq_len) q = self.query_dense(inputs) # (batch, seq_len, key_dim) @@ -282,4 +272,3 @@ def forward(self, inputs, mask=None): for encoder_layer in self.transformer_blocks: outputs = encoder_layer(inputs=outputs, mask=mask) return outputs # (batch, seq_len, out_dim) <-- USUALLY out_dim = emb_size - diff --git a/lpd/metrics/confusion_matrix.py b/lpd/metrics/confusion_matrix.py index 4b938de..51f7f51 100644 --- a/lpd/metrics/confusion_matrix.py +++ b/lpd/metrics/confusion_matrix.py @@ -148,4 +148,5 @@ def update_state(self, y_pred: T.Tensor, y_true: T.Tensor): y_true_class_idxs = y_true.long() for row, col in zip(y_pred_class_idxs, y_true_class_idxs): - self.confusion[row][col] += 1 + self.confusion[row.cpu()][col.cpu()] += 1 + diff --git a/setup-nodeps.py b/setup-nodeps.py index 21185a6..262f020 100644 --- a/setup-nodeps.py +++ b/setup-nodeps.py @@ -27,16 +27,14 @@ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Utilities' ] setup( name='lpd-nodeps', - version='0.4.8', + version='0.4.9', description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch', long_description_content_type='text/markdown', long_description=README_md, @@ -47,7 +45,7 @@ maintainer_email='torch.lpd@gmail.com', packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']), install_requires=install_requires_nodeps, - python_requires='>=3.6', + python_requires='>=3.9', classifiers=classifiers, keywords=['lpd-nodeps'] ) \ No newline at end of file diff --git a/setup.py b/setup.py index d21f203..d9fe940 100644 --- a/setup.py +++ b/setup.py @@ -29,16 +29,14 @@ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Utilities' ] setup( name='lpd', - version='0.4.8', + version='0.4.9', description='A Fast, Flexible Trainer with Callbacks and Extensions for PyTorch', long_description_content_type='text/markdown', long_description=README_md, @@ -49,7 +47,7 @@ maintainer_email='torch.lpd@gmail.com', packages=find_packages(exclude=['tests', 'tests/*', 'examples', 'examples/*']), install_requires=install_requires, - python_requires='>=3.6', + python_requires='>=3.9', classifiers=classifiers, keywords=['pytorch,trainer,callback,callbacks,earlystopping,tensorboard,modelcheckpoint,checkpoint,layers,dense,metrics,predictor,binary accuracy,extensions,track,monitor,machine,deep learning,neural,networks,AI,keras decay,confusion matrix'] ) \ No newline at end of file