diff --git a/fractoral_norm.py b/fractoral_norm.py
new file mode 100644
index 00000000..832509e5
--- /dev/null
+++ b/fractoral_norm.py
@@ -0,0 +1,10 @@
+from zeta.nn import FractoralNorm  # Import the FractoralNorm class from the zeta.nn module
+import torch  # Import torch for tensor operations
+
+# Input tensor
+x = torch.randn(2, 3, 4)  # Generate a random tensor of shape (2, 3, 4)
+
+# FractoralNorm
+normed = FractoralNorm(4, 4)(x)  # Apply FractoralNorm (num_features=4, depth=4) to x
+
+print(normed)  # Print the normalized tensor; its shape is unchanged: torch.Size([2, 3, 4])
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 23f66592..70f5533d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "zetascale"
-version = "2.4.5"
+version = "2.4.6"
 description = "Rapidly Build, Optimize, and Deploy SOTA AI Models"
 authors = ["Zeta Team "]
 license = "MIT"
@@ -16,7 +16,7 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.10"
 torch = ">=2.1.1,<3.0"
 pytest = "8.1.1"
 torchfix = "*"
diff --git a/Dockerfile b/scripts/Dockerfile
similarity index 100%
rename from Dockerfile
rename to scripts/Dockerfile
diff --git a/zeta/__init__.py b/zeta/__init__.py
index 22e2a8c9..dc752fd4 100644
--- a/zeta/__init__.py
+++ b/zeta/__init__.py
@@ -9,8 +9,10 @@
 from zeta.optim import *  # noqa: F403, E402
 from zeta.quant import *  # noqa: F403, E402
 from zeta.rl import *  # noqa: F403, E402
-
-# from zeta.tokenizers import *  # noqa: F403, E402
 from zeta.training import *  # noqa: F403, E402
 from zeta.utils import *  # noqa: F403, E402
-from zeta.experimental import *  # noqa: F403, E402
+
+try:
+    from zeta.experimental import *  # noqa: F403, E402
+except ImportError:
+    pass
diff --git a/zeta/nn/modules/__init__.py b/zeta/nn/modules/__init__.py
index d960e0db..fc2bf595 100644
--- a/zeta/nn/modules/__init__.py
+++ b/zeta/nn/modules/__init__.py
@@ -214,7 +214,8 @@
 from zeta.nn.modules.query_proposal import TextHawkQueryProposal
 from zeta.nn.modules.pixel_shuffling import PixelShuffleDownscale
 from zeta.nn.modules.kan import KAN
-
+from zeta.nn.modules.layer_scale import LayerScale
+from zeta.nn.modules.fractoral_norm import FractoralNorm
 # from zeta.nn.modules.img_reshape import image_reshape
 # from zeta.nn.modules.flatten_features import flatten_features
 # from zeta.nn.modules.scaled_sinusoidal import ScaledSinuosidalEmbedding
@@ -426,4 +427,6 @@
     "TextHawkQueryProposal",
     "PixelShuffleDownscale",
     "KAN",
+    "LayerScale",
+    "FractoralNorm",
 ]
diff --git a/zeta/nn/modules/feedforward.py b/zeta/nn/modules/feedforward.py
index 5ab22882..bee66c71 100644
--- a/zeta/nn/modules/feedforward.py
+++ b/zeta/nn/modules/feedforward.py
@@ -3,7 +3,7 @@
 from zeta.nn.modules.glu import GLU
 from zeta.nn.modules.swiglu import SwiGLU
 from typing import Optional
-from zeta.experimental.triton.triton_modules.linear_proj import LinearTriton
+# from zeta.experimental.triton.triton_modules.linear_proj import LinearTriton
 
 
 class ReluSquared(nn.Module):
@@ -95,10 +95,10 @@ def __init__(
             project_in = GLU(
                 dim, inner_dim, activation, mult_bias=glu_mult_bias
             )
-        elif triton_kernels_on is True:
-            project_in = nn.Sequential(
-                LinearTriton(dim, inner_dim, bias=no_bias), activation
-            )
+        # elif triton_kernels_on is True:
+        #     project_in = nn.Sequential(
+        #         LinearTriton(dim, inner_dim, bias=no_bias), activation
+        #     )
         else:
             project_in = nn.Sequential(
                 nn.Linear(dim, inner_dim, bias=not no_bias), activation
diff --git a/zeta/nn/modules/fractoral_norm.py b/zeta/nn/modules/fractoral_norm.py
index bf4ccf84..9d68beee 100644
--- a/zeta/nn/modules/fractoral_norm.py
+++ b/zeta/nn/modules/fractoral_norm.py
@@ -10,11 +10,11 @@ class FractoralNorm(nn.Module):
         depth (int): Number of times to apply LayerNorm.
     """
 
-    def __init__(self, num_features: int, depth: int):
+    def __init__(self, num_features: int, depth: int, *args, **kwargs):
         super().__init__()
 
         self.layers = nn.ModuleList(
            [nn.LayerNorm(num_features) for _ in range(depth)]
+            [nn.LayerNorm(num_features, *args, **kwargs) for _ in range(depth)]
         )
 
     def forward(self, x: Tensor) -> Tensor:
diff --git a/zeta/nn/modules/layer_scale.py b/zeta/nn/modules/layer_scale.py
new file mode 100644
index 00000000..58e5083c
--- /dev/null
+++ b/zeta/nn/modules/layer_scale.py
@@ -0,0 +1,32 @@
+from torch.nn import Module
+import torch
+from torch import nn, Tensor
+
+class LayerScale(Module):
+    """
+    Applies layer scaling to the output of a given module.
+
+    Args:
+        fn (Module): The module to apply layer scaling to.
+        dim (int): The dimension along which to apply the scaling.
+        init_value (float, optional): The initial value for the scaling factor. Defaults to 0.
+
+    Attributes:
+        fn (Module): The module to apply layer scaling to.
+        gamma (Parameter): The scaling factor parameter.
+
+    """
+
+    def __init__(self, fn: Module, dim, init_value=0.):
+        super().__init__()
+        self.fn = fn
+        self.gamma = nn.Parameter(torch.ones(dim) * init_value)
+
+    def forward(self, x, **kwargs):
+        out = self.fn(x, **kwargs)
+
+        if isinstance(out, Tensor):
+            return out * self.gamma
+
+        out, *rest = out
+        return out * self.gamma, *rest
\ No newline at end of file
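---

For reviewers, a minimal usage sketch of the two modules this patch adds. Imports follow the `zeta.nn.modules` exports added above; the `nn.Linear` stand-in, `init_value=1e-5`, and `eps=1e-6` are illustrative assumptions, not values taken from this diff:

```python
import torch
from torch import nn

from zeta.nn.modules import FractoralNorm, LayerScale

x = torch.randn(2, 3, 4)

# FractoralNorm now forwards extra args/kwargs to each nn.LayerNorm,
# so LayerNorm options such as eps can be overridden (illustrative value).
norm = FractoralNorm(4, 2, eps=1e-6)  # num_features=4, depth=2
print(norm(x).shape)  # torch.Size([2, 3, 4])

# LayerScale multiplies the wrapped module's output by a learnable
# per-channel gamma; nn.Linear is a hypothetical stand-in for any block.
scaled = LayerScale(nn.Linear(4, 4), dim=4, init_value=1e-5)
print(scaled(x).shape)  # torch.Size([2, 3, 4])
```

With the default `init_value=0.`, gamma starts at zero, so a wrapped residual branch contributes nothing at initialization and fades in as training updates gamma; when `fn` returns a tuple, only its first element is scaled.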