From cb58448858181db6fc80ba78e018e931b545875e Mon Sep 17 00:00:00 2001 From: Kye Date: Fri, 5 Apr 2024 22:47:32 -0400 Subject: [PATCH] [FEAT]-[Module]: [return_loss_text]: Add [return_loss_text] function for enhanced loss computation readability [FEAT]-[Module]: [calc_z_loss]: Introduce [calc_z_loss] function to calculate Z loss in model training [FEAT]-[Module]: [max_neg_value]: Implement [max_neg_value] function for negative value handling in computations [FEAT]-[Module]: [TextTokenEmbedding]: Deploy [TextTokenEmbedding] for improved text token embedding functionality [FEAT]-[Module]: [dropout_seq]: Add [dropout_seq] function for sequence dropout in neural network layers [FEAT]-[Module]: [transformer_generate]: Introduce [transformer_generate] function for efficient transformer text generation [FEAT]-[Module]: [vit_output_head]: Add [vit_output_head] for Vision Transformer model output handling [FEAT]-[Module]: [patch_linear_flatten]: Implement [patch_linear_flatten] for streamlined linear patch flattening in ViT [FEAT]-[Module]: [ScalableImgSelfAttention]: Introduce [ScalableImgSelfAttention] for scalable image self-attention mechanism ] --- playground/models/spectra.py | 0 pyproject.toml | 3 +- zeta/nn/attention/__init__.py | 6 +- zeta/nn/attention/scalable_img_self_attn.py | 129 +++++++++++++ zeta/nn/modules/__init__.py | 24 +++ zeta/nn/modules/chan_layer_norm.py | 37 ++++ zeta/nn/modules/patch_linear_flatten.py | 88 +++++++++ zeta/nn/modules/peg.py | 34 ++++ zeta/nn/modules/return_loss_text.py | 196 ++++++++++++++++++++ zeta/structs/auto_regressive_wrapper.py | 28 ++- 10 files changed, 530 insertions(+), 15 deletions(-) create mode 100644 playground/models/spectra.py create mode 100644 zeta/nn/attention/scalable_img_self_attn.py create mode 100644 zeta/nn/modules/chan_layer_norm.py create mode 100644 zeta/nn/modules/patch_linear_flatten.py create mode 100644 zeta/nn/modules/peg.py create mode 100644 zeta/nn/modules/return_loss_text.py diff --git a/playground/models/spectra.py b/playground/models/spectra.py new file mode 100644 index 00000000..e69de29b diff --git a/pyproject.toml b/pyproject.toml index 55979046..17bcfe24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "zetascale" -version = "2.3.3" +version = "2.3.5" description = "Rapidly Build, Optimize, and Deploy SOTA AI Models" authors = ["Zeta Team "] license = "MIT" @@ -35,6 +35,7 @@ tqdm = "4.66.2" rich = "13.7.1" colt5-attention = "*" argparse = "^1.4.0" +local-attention = "*" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/zeta/nn/attention/__init__.py b/zeta/nn/attention/__init__.py index 1f55a15c..563c96a2 100644 --- a/zeta/nn/attention/__init__.py +++ b/zeta/nn/attention/__init__.py @@ -22,10 +22,7 @@ from zeta.nn.attention.spatial_linear_attention import SpatialLinearAttention from zeta.structs.transformer import Attention, AttentionLayers from zeta.nn.attention.multi_grouped_attn import MultiGroupedQueryAttn - -# from zeta.nn.attention.flash_attention2 import FlashAttentionTwo -# from zeta.nn.attention.mgqa import MGQA - +from zeta.nn.attention.scalable_img_self_attn import ScalableImgSelfAttention __all__ = [ "Attend", @@ -48,4 +45,5 @@ "Attention", "AttentionLayers", "MultiGroupedQueryAttn", + "ScalableImgSelfAttention", ] diff --git a/zeta/nn/attention/scalable_img_self_attn.py b/zeta/nn/attention/scalable_img_self_attn.py new file mode 100644 index 00000000..7a885c01 --- /dev/null +++ b/zeta/nn/attention/scalable_img_self_attn.py @@ -0,0 +1,129 @@ +import torch 
+from torch import nn, Tensor
+from zeta.nn.modules.chan_layer_norm import ChanLayerNorm
+from einops import rearrange
+
+
+class ScalableImgSelfAttention(nn.Module):
+    """
+    ScalableImgSelfAttention module applies a self-attention mechanism to image data.
+
+    Args:
+        dim (int): The input dimension of the image.
+        heads (int, optional): The number of attention heads. Defaults to 8.
+        dim_key (int, optional): The dimension of the key vectors. Defaults to 32.
+        dim_value (int, optional): The dimension of the value vectors. Defaults to 32.
+        dropout (float, optional): The dropout rate. Defaults to 0.0.
+        reduction_factor (int, optional): The reduction factor for downscaling the keys and values. Defaults to 1.
+
+    Attributes:
+        dim (int): The input dimension of the image.
+        heads (int): The number of attention heads.
+        dim_key (int): The dimension of the key vectors.
+        dim_value (int): The dimension of the value vectors.
+        reduction_factor (int): The reduction factor for downscaling the keys and values.
+        scale (float): The scaling factor for the key vectors.
+        attend (nn.Softmax): The softmax function for attention calculation.
+        dropout (nn.Dropout): The dropout layer.
+        norm (ChanLayerNorm): The channel-wise layer normalization.
+        to_q (nn.Conv2d): The convolutional layer for query projection.
+        to_k (nn.Conv2d): The convolutional layer for key projection.
+        to_v (nn.Conv2d): The convolutional layer for value projection.
+        to_out (nn.Sequential): The sequential layer for output projection.
+
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_key: int = 32,
+        dim_value: int = 32,
+        dropout: float = 0.0,
+        reduction_factor: int = 1,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.heads = heads
+        self.dim_key = dim_key
+        self.dim_value = dim_value
+        self.reduction_factor = reduction_factor
+
+        self.scale = dim_key**-0.5
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+        self.norm = ChanLayerNorm(dim)
+
+        # Projections
+        self.to_q = nn.Conv2d(dim, dim_key * heads, 1, bias=False)
+        self.to_k = nn.Conv2d(
+            dim,
+            dim_key * heads,
+            reduction_factor,
+            stride=reduction_factor,
+            bias=False,
+        )
+        self.to_v = nn.Conv2d(
+            dim,
+            dim_value * heads,
+            reduction_factor,
+            stride=reduction_factor,
+            bias=False,
+        )
+
+        self.to_out = nn.Sequential(
+            nn.Conv2d(dim_value * heads, dim, 1), nn.Dropout(dropout)
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Forward pass of the ScalableImgSelfAttention module.
+
+        Args:
+            x (Tensor): The input tensor of shape (batch_size, channels, height, width).
+
+        Returns:
+            Tensor: The output tensor of shape (batch_size, channels, height, width).
+
+        """
+        height, width, heads = *x.shape[-2:], self.heads
+
+        x = self.norm(x)
+
+        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
+
+        # Split out heads
+        q, k, v = map(
+            lambda t: rearrange(t, "b (h d) ... -> b h (...) d", h=heads),
+            (q, k, v),
+        )
+
+        # Similarity
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+
+        # Attention
+        attn = self.attend(dots)
+        attn = self.dropout(attn)
+
+        # Aggregate values
+        out = torch.matmul(attn, v)
+
+        # Merge back heads
+        out = rearrange(
+            out,
+            "b h (x y) d -> b (h d) x y",
+            x=height,
+            y=width,
+        )
+        return self.to_out(out)
+
+
+# x = torch.randn(1, 3, 64, 64)
+# attn = ScalableImgSelfAttention(3)
+# out = attn(x)
+# print(out.shape)
diff --git a/zeta/nn/modules/__init__.py b/zeta/nn/modules/__init__.py
index 52491e3d..f8fcc0be 100644
--- a/zeta/nn/modules/__init__.py
+++ b/zeta/nn/modules/__init__.py
@@ -195,6 +195,21 @@
     NormalSparseMoE,
     HeirarchicalSparseMoE,
 )
+from zeta.nn.modules.return_loss_text import (
+    return_loss_text,
+    calc_z_loss,
+    max_neg_value,
+    TextTokenEmbedding,
+    dropout_seq,
+    transformer_generate,
+)
+from zeta.nn.modules.patch_linear_flatten import (
+    posemb_sincos_2d,
+    vit_output_head,
+    patch_linear_flatten,
+)
+from zeta.nn.modules.chan_layer_norm import ChanLayerNorm
+
 
 # from zeta.nn.modules.img_reshape import image_reshape
 # from zeta.nn.modules.flatten_features import flatten_features
@@ -392,4 +407,14 @@
     "Top2Gating",
     "NormalSparseMoE",
     "HeirarchicalSparseMoE",
+    "return_loss_text",
+    "calc_z_loss",
+    "max_neg_value",
+    "TextTokenEmbedding",
+    "dropout_seq",
+    "transformer_generate",
+    "patch_linear_flatten",
+    "vit_output_head",
+    "posemb_sincos_2d",
+    "ChanLayerNorm",
 ]
diff --git a/zeta/nn/modules/chan_layer_norm.py b/zeta/nn/modules/chan_layer_norm.py
new file mode 100644
index 00000000..72c835d9
--- /dev/null
+++ b/zeta/nn/modules/chan_layer_norm.py
@@ -0,0 +1,37 @@
+import torch
+from torch import nn, Tensor
+
+
+class ChanLayerNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        """
+        Initializes the ChanLayerNorm module.
+
+        Args:
+            dim (int): The input dimension.
+            eps (float, optional): The epsilon value. Defaults to 1e-5.
+        """
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
+        self.b = nn.Parameter(torch.zeros(1, dim, 1, 1))
+
+    def forward(self, x: Tensor):
+        """
+        Forward pass of the ChanLayerNorm module.
+
+        Args:
+            x (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The normalized tensor.
+        """
+        var = torch.var(
+            x,
+            dim=1,
+            unbiased=False,
+            keepdim=True,
+        )
+        mean = torch.mean(x, dim=1, keepdim=True)
+        return (x - mean) / (var + self.eps).sqrt() * self.g + self.b
diff --git a/zeta/nn/modules/patch_linear_flatten.py b/zeta/nn/modules/patch_linear_flatten.py
new file mode 100644
index 00000000..43fd786a
--- /dev/null
+++ b/zeta/nn/modules/patch_linear_flatten.py
@@ -0,0 +1,88 @@
+import torch
+from torch import nn, Tensor
+from einops import rearrange
+from einops.layers.torch import Rearrange
+
+
+def posemb_sincos_2d(patches, temperature=10000, dtype=torch.float32):
+    _, h, w, dim, device, dtype = *patches.shape, patches.device, patches.dtype
+
+    y, x = torch.meshgrid(
+        torch.arange(h, device=device),
+        torch.arange(w, device=device),
+        indexing="ij",
+    )
+    assert (
+        dim % 4
+    ) == 0, "feature dimension must be multiple of 4 for sincos emb"
+    omega = torch.arange(dim // 4, device=device) / (dim // 4 - 1)
+    omega = 1.0 / (temperature**omega)
+
+    y = y.flatten()[:, None] * omega[None, :]
+    x = x.flatten()[:, None] * omega[None, :]
+    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
+    return pe.type(dtype)
+
+
+def vit_output_head(x: Tensor, dim: int, num_classes: int = None):
+    """
+    Applies a Vision Transformer (ViT) output head to the input tensor.
+
+    Args:
+        x (Tensor): The input tensor.
+        dim (int): The dimension of the input tensor.
+        num_classes (int, optional): The number of output classes. Defaults to None.
+
+    Returns:
+        Tensor: The output tensor after applying the ViT output head.
+    """
+    return nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))(x)
+
+
+def patch_linear_flatten(
+    x: Tensor,
+    patch_size: int,
+    dim: int,
+    image_size: int,
+    channels: int = 3,
+    add_pos_embeddings: bool = False,
+    *args,
+    **kwargs,
+):
+    """
+    Applies patch embedding to the input tensor and flattens it.
+
+    Args:
+        x (Tensor): Input tensor of shape (batch_size, channels, image_height, image_width).
+        patch_size (int): Size of the square patch.
+        dim (int): Dimension of the output tensor.
+        image_size (int): Size of the input image (assumed to be square).
+        channels (int, optional): Number of input channels. Defaults to 3.
+        add_pos_embeddings (bool, optional): Whether to add sincos positional embeddings. Defaults to False.
+
+    Returns:
+        Tensor: Patch embeddings of shape (batch_size, num_patches_h, num_patches_w, dim),
+            or (batch_size, num_patches, dim) with positional embeddings added when
+            add_pos_embeddings is True.
+    """
+    image_height, image_width = image_size, image_size
+    patch_height, patch_width = patch_size, patch_size
+
+    # Flattened dimension of a single patch
+    patch_dim = channels * patch_height * patch_width
+
+    # Patch embedding: (b, c, H, W) -> (b, h, w, dim)
+    to_patch_embeddings = nn.Sequential(
+        Rearrange(
+            "b c (h p1) (w p2) -> b h w (p1 p2 c)",
+            p1=patch_height,
+            p2=patch_width,
+        ),
+        nn.LayerNorm(patch_dim),
+        nn.Linear(patch_dim, dim),
+        nn.LayerNorm(dim),
+    )(x)
+
+    if add_pos_embeddings:
+        # Positional embeddings are computed from the patch grid, then added
+        # after flattening the grid into a token sequence.
+        pos_embeddings = posemb_sincos_2d(to_patch_embeddings, *args, **kwargs)
+        to_patch_embeddings = (
+            rearrange(to_patch_embeddings, "b h w d -> b (h w) d")
+            + pos_embeddings
+        )
+
+    return to_patch_embeddings
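+
+
+# A minimal usage sketch (commented out); the image size, patch size, embedding
+# dimension, and class count below are illustrative values, not defaults of this
+# module.
+# img = torch.randn(1, 3, 256, 256)
+# tokens = patch_linear_flatten(
+#     img, patch_size=16, dim=512, image_size=256, add_pos_embeddings=True
+# )  # -> (1, 256, 512): a 16x16 patch grid, flattened, with sincos position embeddings
+# pooled = tokens.mean(dim=1)
+# logits = vit_output_head(pooled, dim=512, num_classes=10)
+# print(logits.shape)  # torch.Size([1, 10])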
+ """ + seq, labels = x[:, :-1], x[:, 1:] + + labels = labels.masked_fill(~mask[:, 1:], ignore_index) + + loss = F.cross_entropy( + rearrange(logits, "b n c -> b c n"), labels, ignore_index=ignore_index + ) + + return loss + + +def add_masking_llm(x: Tensor, mask: Tensor, ignore_index: int): + """ + Adds masking to the input tensor. + + Args: + x (Tensor): The input tensor. + ignore_index (int): The index to ignore. + + Returns: + Tensor: The masked input tensor. + """ + ... + + +def calc_z_loss( + pre_softmax_attns: List[Tensor], mask: Tensor = None, weight: float = 1.0 +): + lse = 0.0 + + for attn in pre_softmax_attns: + lse = lse + attn.logsumexp(dim=-1) + + loss = torch.square(lse) + loss = reduce(loss, "b h n -> b n", "sum") + + if not exists(mask): + return loss.mean() * weight + + loss = loss[mask].sum() / mask.sum().clamp(min=1e-5) + return loss * weight + + +def max_neg_value(tensor: Tensor): + return -torch.finfo(tensor.dtype).max + + +def l2norm(x: Tensor, groups: int = 1): + """ + Applies L2 normalization to the input tensor. + + Args: + x (Tensor): The input tensor to be normalized. + groups (int, optional): The number of groups to divide the input tensor into. Defaults to 1. + + Returns: + Tensor: The normalized tensor. + + """ + x = rearrange(x, "... (g d) -> ... g d", g=groups) + x = F.normalize(x, p=2, dim=-1) + return rearrange(x, "... g d -> ... (g d)") + + +class TextTokenEmbedding(nn.Module): + def __init__( + self, + dim: int, + num_tokens: int, + l2norm_embed: bool = True, + ): + """ + Initializes a TextTokenEmbedding module. + + Args: + dim (int): The dimension of the embedding. + num_tokens (int): The number of tokens in the vocabulary. + l2norm_embed (bool, optional): Whether to apply L2 normalization to the embeddings. Defaults to True. + """ + super().__init__() + self.dim = dim + self.num_tokens = num_tokens + self.l2norm_embed = l2norm_embed + self.embed = nn.Embedding(num_tokens, dim) + + def forward(self, x: Tensor): + """ + Forward pass of the TextTokenEmbedding module. + + Args: + x (Tensor): The input tensor of shape (batch_size, sequence_length). + + Returns: + Tensor: The embedded tensor of shape (batch_size, sequence_length, dim). + """ + token_embed = self.embed(x.long()) + return l2norm(token_embed) if self.l2norm_embed else token_embed + + +def dropout_seq(seq: Tensor, mask: Tensor, dropout: float = 0.0): + """ + Applies dropout to a sequence of tensors. + + Args: + seq (Tensor): The input sequence tensor of shape (batch_size, sequence_length, ...). + mask (Tensor): The mask tensor of shape (batch_size, sequence_length) indicating which elements to keep. + dropout (float, optional): The dropout probability. Defaults to 0. + + Returns: + Tuple[Tensor, Tensor]: A tuple containing the modified sequence tensor and the modified mask tensor. 
+ + """ + b, n, *_, device = *seq.shape, seq.device + logits = torch.randn(b, n, device=device) + + if exists(mask): + mask_value = max_neg_value(logits) + logits = logits.masked_fill(~mask, mask_value) + + keep_prob = 1.0 - dropout + num_keep = max(1, int(keep_prob * n)) + keep_indices = logits.topk(num_keep, dim=1).indices + + batch_indices = torch.arange(b, device=device) + batch_indices = rearrange(batch_indices, "b -> b 1") + + seq = seq[batch_indices, keep_indices] + + if exists(mask): + seq_counts = mask.sum(dim=-1) + seq_keep_counts = torch.ceil(seq_counts * keep_prob).int() + keep_mask = torch.arange(num_keep, device=device) < rearrange( + seq_keep_counts, "b -> b 1" + ) + + mask = mask[batch_indices, keep_indices] & keep_mask + + return seq, mask + + +@torch.no_grad() +def transformer_generate( + model: nn.Module, + prompt: Tensor, + temperature: float = 0.5, + filter_threshold: float = 0.9, + *args, + **kwargs, +): + """ + Generates text given a prompt. + + Args: + model (nn.Module): The model to generate text. + prompt (Tensor): The prompt tensor. + + Returns: + Tensor: The generated text. + """ + model = AutoRegressiveWrapper(net=model) + + return model.generate( + prompt, + filter_thres=filter_threshold, + temperature=temperature, + *args, + **kwargs, + ) diff --git a/zeta/structs/auto_regressive_wrapper.py b/zeta/structs/auto_regressive_wrapper.py index a7df7879..3f77cbb5 100644 --- a/zeta/structs/auto_regressive_wrapper.py +++ b/zeta/structs/auto_regressive_wrapper.py @@ -1,16 +1,16 @@ import torch import torch.nn.functional as F from einops import pack, rearrange, unpack -from torch import nn +from torch import Tensor, nn -from zeta.utils.main import once # noqa: F401 from zeta.utils.main import ( eval_decorator, exists, + once, # noqa: F401 top_a, top_k, top_p, -) # noqa: E402 +) # Utils @@ -86,7 +86,7 @@ def contrastive_guidance(self, logits, k): return torch.multinomial(F.softmax(top_k_logits, dim=-1), 1) -class AutoregressiveWrapper(nn.Module): +class AutoRegressiveWrapper(nn.Module): """ Auto-regressive wrapper for any nn.Module that takes in a sequence of @@ -114,11 +114,11 @@ class AutoregressiveWrapper(nn.Module): def __init__( self, - net, - ignore_index=-100, - pad_value=0, - mask_prob=0.0, - speculative=False, + net: nn.Module, + ignore_index: int = -100, + pad_value: int = 0, + mask_prob: float = 0.0, + speculative: bool = False, ): super().__init__() self.pad_value = pad_value @@ -138,7 +138,7 @@ def __init__( def generate( self, start_tokens, - seq_len, + seq_len: int, eos_token=None, strategy="temperature", temperature=1.0, @@ -352,3 +352,11 @@ def evaluate_and_select_best_solution( def grade_solution(self, solution): """Grade a solution.""" + ... + return self.net(solution) + + def majority_voting(self, task: Tensor): + """ + Majority voting. + """ + ...
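+
+
+# A minimal usage sketch (commented out). `net` is a placeholder for any
+# autoregressive decoder compatible with this wrapper (token ids in, logits out);
+# the vocabulary size, prompt length, and generation length are illustrative.
+# import torch
+# from zeta.nn.modules import transformer_generate
+#
+# prompt = torch.randint(0, 20000, (1, 8))
+# wrapper = AutoRegressiveWrapper(net)
+# sample = wrapper.generate(prompt, 64, temperature=0.5)
+#
+# # Or through the convenience helper added in this patch, which wraps `net`
+# # in AutoRegressiveWrapper internally:
+# sample = transformer_generate(net, prompt, temperature=0.5, seq_len=64)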