Commit

Merge pull request #15 from brainsqueeze/dev
Dev
brainsqueeze authored Jul 15, 2022
2 parents 27fd3aa + 542e683 commit 85e44b7
Showing 8 changed files with 48 additions and 54 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,6 +16,8 @@ multi_news_t2v*/
 **/node_modules

 # ignore trained model files
+*.tflite
+*.onnx
 **/scratch*
 **/training/
 **/validation/
16 changes: 0 additions & 16 deletions examples/configurations/minimal_sequence.yml

This file was deleted.

12 changes: 0 additions & 12 deletions examples/configurations/minimal_transformer.yml

This file was deleted.

2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

 setup(
     name="text2vec",
-    version="2.0.1",
+    version="2.0.2",
     description="Building blocks for text vectorization and embedding",
     author="Dave Hollander",
     author_url="https://github.com/brainsqueeze",
14 changes: 7 additions & 7 deletions text2vec/models/components/attention.py
@@ -37,7 +37,7 @@ class ScaledDotAttention(layers.Layer):
     """

     def __init__(self):
-        super().__init__(name="ScaledDotAttention")
+        super().__init__()
         self.neg_inf = tf.constant(-1e9, dtype=tf.float32)

     # pylint: disable=missing-function-docstring
@@ -78,7 +78,7 @@ class BahdanauAttention(layers.Layer):
     dims = 12
     encoded_sequences = tf.random.uniform(shape=[4, 7, dims])
     decoded_sequences = tf.random.uniform(shape=[4, 11, dims])
-    attention = BahdanauAttention(dims)
+    attention = BahdanauAttention(dims, drop_rate=0.25)
     # self attention
     attention(encoded_sequences)
@@ -89,7 +89,7 @@ class BahdanauAttention(layers.Layer):
     """

     def __init__(self, size: int, drop_rate: float = 0.):
-        super().__init__(name="BahdanauAttention")
+        super().__init__()

         self.hidden = layers.Dense(units=size, activation="tanh")
         self.U = tf.Variable(initializers.GlorotUniform()(shape=[size]), name="U", dtype=tf.float32, trainable=True)
@@ -141,15 +141,15 @@ class SingleHeadAttention(layers.Layer):
     V = tf.random.uniform(shape=[4, 5, 12])
     # 25% dropout rate
-    attention = SingleHeadAttention(emb_dims=12, keep_prob=0.75)
+    attention = SingleHeadAttention(emb_dims=12, drop_rate=0.25)
     # masking and dropout turned on
     attention(inputs=(Q, K, V), mask_future=True, training=True)
     ```
     """

     def __init__(self, emb_dims, num_layers: int = 8, drop_rate: float = 0.):
-        super().__init__(name="SingleHeadAttention")
+        super().__init__()
         assert isinstance(num_layers, int) and num_layers > 0

         dims = emb_dims
@@ -205,15 +205,15 @@ class MultiHeadAttention(layers.Layer):
     V = tf.random.uniform(shape=[4, 5, 12])
     # 25% dropout rate
-    attention = MultiHeadAttention(emb_dims=12, keep_prob=0.75)
+    attention = MultiHeadAttention(emb_dims=12, drop_rate=0.25)
     # masking and dropout turned on
     attention(inputs=(Q, K, V), mask_future=True, training=True)
     ```
     """

     def __init__(self, emb_dims: int, num_layers: int = 8, drop_rate: float = 0.):
-        super().__init__(name="MultiHeadAttention")
+        super().__init__()
         self.layer_heads = [
             SingleHeadAttention(emb_dims=emb_dims, num_layers=num_layers, drop_rate=drop_rate)
             for _ in range(num_layers)
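A likely motivation for dropping the hard-coded `name=` in these `super().__init__()` calls: with a fixed name every instance of the layer class gets the same name, which clashes once a model contains more than one of them, whereas with no name passed Keras generates a unique name per instance. A minimal sketch of that behaviour (the `TinyAttention` layer below is hypothetical, not part of text2vec):

```python
import tensorflow as tf
from tensorflow.keras import layers


class TinyAttention(layers.Layer):
    """Hypothetical stand-in for the attention layers touched in this diff."""

    def __init__(self, units: int = 8, **kwargs):
        # No fixed name here: Keras assigns "tiny_attention", "tiny_attention_1", ...
        super().__init__(**kwargs)
        self.dense = layers.Dense(units)

    def call(self, x):
        return self.dense(x)


a, b = TinyAttention(), TinyAttention()
print(a.name, b.name)  # unique auto-generated names, e.g. tiny_attention tiny_attention_1
```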
6 changes: 5 additions & 1 deletion text2vec/models/components/strings.py
@@ -1,5 +1,5 @@
 import tensorflow as tf
-from tensorflow.kersa import layers
+from tensorflow.keras import layers

 from text2vec.models import Tokenizer

@@ -74,3 +74,7 @@ def call(self, texts: tf.Tensor, substrings: tf.RaggedTensor) -> tf.RaggedTensor:
             row_lengths=substrings.row_lengths()
         )
         return tf.ragged.map_flat_values(self.find_match, ragged_texts, tf.strings.join([pre, substrings, post]))
+
+    def get_config(self):
+        base_config = super().get_config()
+        return {**base_config, "sep": self.sep}
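The `get_config` methods added in this file (and in `text_inputs.py` below) return the layer's constructor arguments on top of the base config, which is what Keras uses to rebuild the layer when a saved model is reloaded. A small sketch of the round trip, using a made-up `SepSplit` layer that mirrors the pattern rather than the text2vec class itself:

```python
import tensorflow as tf
from tensorflow.keras import layers


class SepSplit(layers.Layer):
    """Made-up layer mirroring the get_config pattern added in this commit."""

    def __init__(self, sep: str = " ", **kwargs):
        super().__init__(**kwargs)
        self.sep = sep

    def call(self, texts):
        return tf.strings.split(texts, self.sep)

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "sep": self.sep}


layer = SepSplit(sep="|")
config = layer.get_config()              # includes "sep" alongside name/trainable/dtype
restored = SepSplit.from_config(config)  # re-creates an equivalent layer from the config
print(restored.sep)                      # |
```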
46 changes: 31 additions & 15 deletions text2vec/models/components/text_inputs.py
@@ -26,12 +26,16 @@ class Tokenizer(layers.Layer):
     """

     def __init__(self, sep: str = ' '):
-        super().__init__(name="Tokenizer")
+        super().__init__()
         self.sep = sep

     def call(self, corpus):
         return tf.strings.split(corpus, self.sep)

+    def get_config(self):
+        base_config = super().get_config()
+        return {**base_config, "sep": self.sep}
+

 class Embed(layers.Layer):
     """This layer handles the primary text feature transformations and word-embeddings to be passed off
@@ -113,7 +117,7 @@ def get_embedding(self, token_ids: tf.RaggedTensor) -> tf.RaggedTensor:
         return tf.ragged.map_flat_values(tf.nn.embedding_lookup, self.embeddings, token_ids)


-class TokenEmbed(tf.keras.layers.Layer):
+class TokenEmbed(layers.Layer):
     """This layer handles the primary text feature transformations and word-embeddings to be passed off
     to the sequence-aware parts of the encoder/decoder pipeline.
@@ -155,23 +159,26 @@ class TokenEmbed(tf.keras.layers.Layer):
     def __init__(self, token_hash: dict, embedding_size: int, max_sequence_len: int, unknown_token: str = '<unk>'):
         super().__init__()

-        self.table = tf.lookup.StaticHashTable(
-            tf.lookup.KeyValueTensorInitializer(
-                keys=list(token_hash.keys()),
-                values=list(token_hash.values())
-            ),
-            default_value=token_hash.get(unknown_token)
-        )
+        self.lookup = token_hash
+        self.unknown_token = unknown_token
+
+        with tf.init_scope():
+            self.table = tf.lookup.StaticHashTable(
+                tf.lookup.KeyValueTensorInitializer(
+                    keys=list(token_hash.keys()),
+                    values=list(token_hash.values())
+                ),
+                default_value=token_hash.get(unknown_token)
+            )
         self.embed_layer = Embed(
             vocab_size=len(token_hash),
             embedding_size=embedding_size,
             max_sequence_len=max_sequence_len
         )

     def call(self, tokens, **kwargs):
-        with tf.name_scope("TextInput"):
-            hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
-            return self.embed_layer(hashed, **kwargs)
+        hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
+        return self.embed_layer(hashed, **kwargs)

     def get_embedding(self, tokens: tf.RaggedTensor) -> tf.RaggedTensor:
         """Get the token embeddings for the input tokens.
@@ -187,9 +194,18 @@ def get_embedding(self, tokens: tf.RaggedTensor) -> tf.RaggedTensor:
         Sequences of token embeddings with the same number of time steps as `tokens`
         """

-        with tf.name_scope("TextToEmbedding"):
-            hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
-            return self.embed_layer.get_embedding(hashed)
+        hashed = tf.ragged.map_flat_values(self.table.lookup, tokens)
+        return self.embed_layer.get_embedding(hashed)
+
+    def get_config(self):
+        base_config = super().get_config()
+        return {
+            **base_config,
+            "token_hash": self.lookup,
+            "embedding_size": int(tf.shape(self.embeddings)[1].numpy()),
+            "max_sequence_len": int(self.embed_layer.max_len.numpy()),
+            "unknown_token": self.unknown_token
+        }

     @property
     def slicer(self):
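Wrapping the `StaticHashTable` construction in `tf.init_scope()` lifts the table-creation ops out of whatever graph happens to be under construction (for example a `tf.function` trace), so the table is built once in the outermost eager context; that appears to be the intent of the `TokenEmbed` change above. A self-contained sketch of the same pattern with a toy vocabulary (the `VocabLookup` layer is illustrative, not the text2vec class):

```python
import tensorflow as tf
from tensorflow.keras import layers


class VocabLookup(layers.Layer):
    """Illustrative layer: string tokens -> integer ids via a static lookup table."""

    def __init__(self, vocab: dict, unknown_token: str = "<unk>", **kwargs):
        super().__init__(**kwargs)
        self.vocab = vocab
        self.unknown_token = unknown_token

        with tf.init_scope():  # build the table outside any graph being traced
            self.table = tf.lookup.StaticHashTable(
                tf.lookup.KeyValueTensorInitializer(
                    keys=list(vocab.keys()),
                    values=list(vocab.values())
                ),
                default_value=vocab.get(unknown_token)
            )

    def call(self, tokens):
        # map the lookup over the flat values of the ragged token tensor
        return tf.ragged.map_flat_values(self.table.lookup, tokens)


lookup = VocabLookup({"<unk>": 0, "hello": 1, "world": 2})
print(lookup(tf.ragged.constant([["hello", "unseen"], ["world"]])))
# -> <tf.RaggedTensor [[1, 0], [2]]>
```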
4 changes: 2 additions & 2 deletions text2vec/models/components/utils.py
@@ -28,7 +28,7 @@ class LayerNorm(layers.Layer):
     """

     def __init__(self, epsilon: float = 1e-8, scale: float = 1.0, bias: float = 0):
-        super().__init__(name="LayerNorm")
+        super().__init__()
         self.epsilon = tf.constant(epsilon, dtype=tf.float32)
         self.scale = tf.constant(scale, dtype=tf.float32)
         self.bias = tf.constant(bias, dtype=tf.float32)
@@ -60,7 +60,7 @@ class TensorProjection(layers.Layer):
     """

     def __init__(self):
-        super().__init__(name="TensorProjection")
+        super().__init__()

     def call(self, x, projection_vector):
         projection_vector = tf.math.l2_normalize(projection_vector, axis=-1)
