Upload 10 files
- .gitignore +5 -0
- __init__.py +1 -0
- hf_utils.py +15 -0
- mamba_block.py +354 -0
- mamba_config.py +86 -0
- mamba_model.py +183 -0
- mlp.py +43 -0
- setup.py +159 -0
- switch_mlp.py +91 -0
- utils.py +82 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
*__pycache__/
*.egg-info/
build/
**.so
**.ipynb
__init__.py
ADDED
@@ -0,0 +1 @@
(a single blank line)
hf_utils.py
ADDED
@@ -0,0 +1,15 @@
import json
import torch
import transformers
from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
from transformers.utils.hub import cached_file


def load_config_hf(model_name):
    resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
    return json.load(open(resolved_archive_file))


def load_state_dict_hf(model_name, device="cpu"):
    resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
    return torch.load(resolved_archive_file, map_location=device)
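
For context, a minimal usage sketch of these two helpers (my addition; the repository id below is a placeholder, not something defined by this upload):

    from hf_utils import load_config_hf, load_state_dict_hf

    # Hypothetical checkpoint id used purely for illustration.
    repo_id = "some-org/some-mamba-checkpoint"
    config_dict = load_config_hf(repo_id)       # parsed config.json as a dict
    state_dict = load_state_dict_hf(repo_id)    # tensors mapped to CPU
    print(config_dict.get("hidden_size"), len(state_dict))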
mamba_block.py
ADDED
@@ -0,0 +1,354 @@
import math
from typing import Optional, Union
import re
from contextlib import nullcontext
from abc import ABC, abstractmethod
from dataclasses import dataclass
import functools
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from einops import rearrange, repeat

try:
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
except ImportError:
    causal_conv1d_fn, causal_conv1d_update = None, None

try:
    from ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
except ImportError:
    selective_scan_fn, mamba_inner_fn = None, None

try:
    from ops.triton.selective_state_update import selective_state_update
except ImportError:
    selective_state_update = None

try:
    from ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
except ImportError:
    RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None

from mamba_layer import MambaLayer
from mamba_config import MambaConfig
from mlp import MLP
from switch_mlp import SwitchMLP


class MambaBlock(nn.Module):
    def __init__(
        self, config, mixer_cls, moe_cls=None, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
    ):
        super().__init__()
        self.config = config
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.mixer = mixer_cls(config)

        if not config.rms_norm:
            self.norm = norm_cls
        else:
            self.norm = norm_cls(config.hidden_size)

        if self.fused_add_norm:
            assert RMSNorm is not None, "RMSNorm import fails"
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
        if moe_cls is not None:
            self.moe = moe_cls(config)
        else:
            self.moe = None

    def forward(
        self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
    ):

        if not self.fused_add_norm:
            residual = (hidden_states + residual) if residual is not None else hidden_states
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
            hidden_states, residual = fused_add_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
            )
        hidden_states = self.mixer(hidden_states, inference_params=inference_params)
        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)

class MambaBlockParallelMoe(nn.Module):
    def __init__(
        self, config, mixer_cls, moe_cls=None, norm_cls=nn.LayerNorm, norm_moe=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
    ):

        super().__init__()
        self.config = config
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.mixer = mixer_cls(config)
        if not config.rms_norm:
            self.norm = norm_cls
            self.norm_moe = norm_moe
        else:
            self.norm = norm_cls(config.hidden_size)
            self.norm_moe = norm_moe(config.hidden_size)
        if self.fused_add_norm:
            assert RMSNorm is not None, "RMSNorm import fails"
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
            assert isinstance(
                self.norm_moe, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
        if moe_cls is not None:
            self.moe = moe_cls(config)
        else:
            self.moe = None

    def forward(
        self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
    ):

        if not self.fused_add_norm:
            residual = (hidden_states + residual) if residual is not None else hidden_states
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
            hidden_states_moe = self.norm_moe(residual.to(dtype=self.norm.weight.dtype))
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
            hidden_states, residual = fused_add_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
            )
            hidden_states_moe, _ = fused_add_norm_fn(
                hidden_states,
                self.norm_moe.weight,
                self.norm_moe.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm_moe.eps,
            )

        hidden_states = self.mixer(hidden_states, inference_params=inference_params)

        hidden_states_moe = self.moe(hidden_states_moe)
        hidden_states += hidden_states_moe
        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)


class MoEBlock(nn.Module):
    def __init__(
        self, config, mixer_cls, moe_cls=None, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
    ):

        super().__init__()
        self.config = config
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.mixer = mixer_cls(config)
        if not config.rms_norm:
            self.norm = norm_cls
        else:
            self.norm = norm_cls(config.hidden_size)
        if self.fused_add_norm:
            assert RMSNorm is not None, "RMSNorm import fails"
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
        if moe_cls is not None:
            self.moe = moe_cls(config)
        else:
            self.moe = None

    def forward(
        self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
    ):
        if not self.fused_add_norm:
            residual = (hidden_states + residual) if residual is not None else hidden_states
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
            hidden_states, residual = fused_add_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
            )
        hidden_states = self.mixer(hidden_states)
        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)


def create_block(config, layer_idx):

    if config.rms_norm:
        norm_cls = partial(RMSNorm, eps=config.layernorm_epsilon)
    else:
        norm_cls = partial(nn.LayerNorm if not config.rms_norm else RMSNorm, eps=config.layernorm_epsilon)

    if (not config.mamba_moe_layers) or config.mamba_moe_layers[layer_idx-1][0] == 'r':
        if (not config.mamba_moe_layers) or len(config.mamba_moe_layers[layer_idx-1]) == 1:
            mixer_cls = partial(MambaLayer, layer_idx=layer_idx)
            block = MambaBlock(
                config,
                mixer_cls=mixer_cls,
                norm_cls=norm_cls,
                fused_add_norm=config.fused_add_norm,
                residual_in_fp32=config.residual_in_fp32,
            )
        else:
            if config.mamba_moe_layers[layer_idx-1][1] == '1':
                if config.rms_norm:
                    norm_moe = partial(RMSNorm, eps=config.layernorm_epsilon)
                else:
                    norm_moe = partial(
                        nn.LayerNorm if not config.rms_norm else RMSNorm, eps=config.layernorm_epsilon
                    )
                mixer_cls = partial(MambaLayer, layer_idx=layer_idx)
                moe_cls = partial(MLP, layer_idx=layer_idx)
                block = MambaBlockParallelMoe(
                    config,
                    mixer_cls=mixer_cls,
                    moe_cls=moe_cls,
                    norm_cls=norm_cls,
                    norm_moe=norm_moe,
                    fused_add_norm=config.fused_add_norm,
                    residual_in_fp32=config.residual_in_fp32,
                )
            else:
                if config.rms_norm:
                    norm_moe = partial(RMSNorm, eps=config.layernorm_epsilon)
                else:
                    norm_moe = partial(
                        nn.LayerNorm if not config.rms_norm else RMSNorm, eps=config.layernorm_epsilon
                    )
                mixer_cls = partial(MambaLayer, layer_idx=layer_idx)
                moe_cls = partial(SwitchMLP, layer_idx=layer_idx)
                block = MambaBlockParallelMoe(
                    config,
                    mixer_cls=mixer_cls,
                    moe_cls=moe_cls,
                    norm_cls=norm_cls,
                    norm_moe=norm_moe,
                    fused_add_norm=config.fused_add_norm,
                    residual_in_fp32=config.residual_in_fp32,
                )
    else:
        if config.mamba_moe_layers[layer_idx-1][0] == '1':
            mixer_cls = partial(MLP, layer_idx=layer_idx)
            block = MoEBlock(
                config,
                mixer_cls=mixer_cls,
                norm_cls=norm_cls,
                fused_add_norm=config.fused_add_norm,
                residual_in_fp32=config.residual_in_fp32,
            )
        else:
            mixer_cls = partial(SwitchMLP, layer_idx=layer_idx)
            block = MoEBlock(
                config,
                mixer_cls=mixer_cls,
                norm_cls=norm_cls,
                fused_add_norm=config.fused_add_norm,
                residual_in_fp32=config.residual_in_fp32,
            )
    block.layer_idx = layer_idx
    return block

class MambaDecoder(nn.Module):
    """Class wrapping a decoder stack of mamba blocks."""

    def __init__(
        self,
        config: MambaConfig,
        post_layer_norm=True,
        pre_process=True,
        post_process=True,
    ):
        super().__init__()

        self.config: MambaConfig = config
        self.post_layer_norm = post_layer_norm
        self.pre_process = pre_process
        self.post_process = post_process
        self.norm_cls = partial(nn.LayerNorm, eps=self.config.layernorm_epsilon)

        self._build_layers()

    def _build_layers(self):

        num_layers_to_build = self.config.num_layers
        # build the actual mamba layers
        self.layers = torch.nn.ModuleList([create_block(self.config, i + 1) for i in range(num_layers_to_build)])

        if self.post_process and self.post_layer_norm:
            # Final layer norm before output.
            self.final_layernorm = self.norm_cls(self.config.hidden_size, bias=True)

    def _get_layer(self, layer_number):
        return self.layers[layer_number]

    def forward(self, hidden_states, residual=None, inference_params=None):

        if not self.pre_process:
            # See set_input_tensor()
            hidden_states = self.input_tensor

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states=hidden_states,
                residual=residual,
                inference_params=inference_params,
            )

        # Final layer norm.
        if self.post_process and self.post_layer_norm:
            if not self.config.fused_add_norm:
                residual = (hidden_states + residual) if residual is not None else hidden_states
                hidden_states = self.final_layernorm(residual.to(dtype=self.final_layernorm.weight.dtype))
            else:
                # Set prenorm=False here since we don't need the residual
                fused_add_norm_fn = rms_norm_fn if isinstance(self.final_layernorm, RMSNorm) else layer_norm_fn
                hidden_states = fused_add_norm_fn(
                    hidden_states,
                    self.final_layernorm.weight,
                    self.final_layernorm.bias,
                    eps=self.final_layernorm.eps,
                    residual=residual,
                    prenorm=False,
                    residual_in_fp32=self.residual_in_fp32,
                )
        return hidden_states
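
The create_block() dispatch above keys off per-layer codes in config.mamba_moe_layers. The standalone helper below is only my reading of that branching, written out for documentation; the example codes are assumptions inferred from the conditions, not values defined by this upload:

    from typing import Optional

    def describe_layer(code: Optional[str]) -> str:
        # Mirrors create_block(): an empty mamba_moe_layers or a bare "r"
        # gives a plain Mamba block; "r"-prefixed codes add a parallel
        # MLP/MoE branch; other codes replace the mixer entirely.
        if not code or code == "r":
            return "MambaBlock (Mamba mixer only)"
        if code[0] == "r":
            return ("MambaBlockParallelMoe + dense MLP" if code[1] == "1"
                    else "MambaBlockParallelMoe + SwitchMLP")
        return "MoEBlock + dense MLP" if code[0] == "1" else "MoEBlock + SwitchMLP"

    for c in [None, "r", "r1", "r8", "1", "8"]:
        print(c, "->", describe_layer(c))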
mamba_config.py
ADDED
@@ -0,0 +1,86 @@
from dataclasses import dataclass
from typing import Callable
import torch
import torch.nn.functional as F
from utils import init_method_normal, scaled_init_method_normal


@dataclass
class MambaConfig():
    base_model_type: str = "mamba"
    num_layers: int = 0
    hidden_size: int = 0
    state_size: int = 0
    vocab_size: int = 50000
    expansion_factor: int = 2
    conv_dimension: int = 0
    conv_bias: bool = True
    bias: bool = True
    use_fast_path: bool = True
    dt_rank: str = "auto"
    dt_min: float = 0.001
    dt_max: float = 0.1
    dt_init: str = "random"
    dt_scale: float = 1.0
    dt_init_floor: float = 1e-4
    rms_norm: bool = True
    fused_add_norm: bool = False
    residual_in_fp32: bool = True
    hidden_dropout: float = 0.0
    ffn_hidden_size: int = None
    gated_linear_unit: bool = False
    mamba_moe_layers: str = ""
    routing_mode: str = "sinkhorn"
    device: str = "cuda"
    fp32_residual_connection: bool = False
    layernorm_epsilon: float = 1e-5
    layernorm_zero_centered_gamma: bool = False
    add_bias_linear: bool = True
    activation_func: Callable = F.gelu
    num_moe_experts: int = None

    # initialization
    init_method: Callable = None
    output_layer_init_method: Callable = None
    init_method_std: float = 0.02

    # mixed-precision
    apply_query_key_layer_scaling: bool = True
    attention_softmax_in_fp32: bool = True

    # fusion
    gated_linear_unit: bool = False
    bias_gelu_fusion: bool = False
    persist_layer_norm: bool = False
    bias_dropout_fusion: bool = False


    def __post_init__(self):
        """ Python dataclass method that is used to modify attributes after initialization.
        See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
        """
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True

        if self.ffn_hidden_size is None:
            self.ffn_hidden_size = 4 * self.hidden_size

        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True

        if self.bias_gelu_fusion:
            if not self.add_bias_linear:
                raise ValueError(
                    "When bias_gelu_fusion is True, add_bias_linear must also be True."
                )

            if self.activation_func != F.gelu:
                raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.')

        if self.init_method is None:
            self.init_method = init_method_normal(self.init_method_std)

        if self.output_layer_init_method is None:
            self.output_layer_init_method = scaled_init_method_normal(
                self.init_method_std, self.num_layers
            )
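
A quick sketch of constructing a config and checking what __post_init__ fills in (the sizes here are arbitrary assumptions chosen for illustration, not values from this upload):

    from mamba_config import MambaConfig

    cfg = MambaConfig(num_layers=4, hidden_size=256, state_size=16,
                      conv_dimension=4, device="cpu")
    print(cfg.ffn_hidden_size)   # None -> 4 * hidden_size = 1024
    print(cfg.init_method)       # filled in via init_method_normal(0.02)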
mamba_model.py
ADDED
@@ -0,0 +1,183 @@
import logging
from typing import Literal, Optional, Union
import functools
from functools import partial
import torch
import torch.nn as nn
from torch import Tensor
import math
import os
from mamba_block import MambaBlock, MambaDecoder
from mamba_config import MambaConfig
from hf_utils import *
import os, json
from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
from transformers.utils.hub import cached_file


# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
def _init_weights(
    module,
    n_layer,
    initializer_range=0.02,  # Now only used for embedding layer.
    rescale_prenorm_residual=True,
    n_residuals_per_layer=1,  # Change to 2 if we have MLP
):
    if isinstance(module, nn.Linear):
        if module.bias is not None:
            if not getattr(module.bias, "_no_reinit", False):
                nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=initializer_range)

    if rescale_prenorm_residual:
        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in module.named_parameters():
            if name in ["out_proj.weight", "fc2.weight"]:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
                # We need to reinit p since this code could be called multiple times
                # Having just p *= scale would repeatedly scale it down
                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                with torch.no_grad():
                    p /= math.sqrt(n_residuals_per_layer * n_layer)


class MambaModel(nn.Module):
    def __init__(
        self,
        config: MambaConfig,
        max_sequence_length: int,
        pre_process: bool = True,
        post_process: bool = True,
        fp16_lm_cross_entropy: bool = False,
        parallel_output: bool = True,
        share_embeddings_and_output_weights: bool = True,
        initializer_cfg=None,
    ) -> None:
        super().__init__()

        self.config: MambaConfig = config
        self.max_sequence_length = max_sequence_length
        self.pre_process = pre_process
        self.post_process = post_process
        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
        self.parallel_output = parallel_output
        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights

        if self.pre_process:
            self.embedding = nn.Embedding(self.config.vocab_size, self.config.hidden_size)

        self.decoder = MambaDecoder(
            config=self.config,
            pre_process=self.pre_process,
            post_process=self.post_process,
        )

        if post_process:
            self.output_layer = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=self.config.add_bias_linear)
            if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
                self.initialize_last_stage_with_word_embeddings()

        # apply weight initialization
        self.apply(
            partial(
                _init_weights,
                n_layer=self.config.num_layers,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )

    def initialize_last_stage_with_word_embeddings(self):
        with torch.no_grad():
            self.output_layer.weight = self.embedding.weight

    def forward(
        self,
        input_ids,
        position_ids=None,
        decoder_input: Tensor = None,
        labels: Tensor = None,
        inference_params=None,
    ) -> Tensor:
        if decoder_input is not None:
            pass
        elif self.pre_process:
            decoder_input = self.embedding(input_ids)
        else:
            decoder_input = None

        hidden_states = self.decoder(
            hidden_states=decoder_input,
            residual=None,
            inference_params=inference_params,
        )

        if not self.post_process:
            return hidden_states

        logits = self.output_layer(hidden_states)

        return logits.contiguous()

    @classmethod
    def from_pretrained(cls, pretrained_model_name=None, checkpoint_name=None, config_name=None, **kwargs):
        if pretrained_model_name is not None:
            json_config = load_config_hf(pretrained_model_name)
            loaded = load_state_dict_hf(pretrained_model_name)
        elif checkpoint_name is not None and config_name is not None:
            with open(config_name, 'r') as f:
                jsonstr = f.read()
                json_config = json.loads(jsonstr)
            loaded = torch.load(checkpoint_name, map_location='cpu')
        else:
            return
        model_state_dict = loaded["model"]

        config = MambaConfig(
            num_layers=json_config['num_layers'],
            hidden_size=json_config['hidden_size'],
            state_size=json_config['state_size'],
            conv_dimension=json_config['conv_dimension'],
            vocab_size=json_config['vocab_size'],
            expansion_factor=json_config['expansion_factor'],
            mamba_moe_layers=json_config['mamba_moe_layers'],
            ffn_hidden_size=json_config['ffn_hidden_size'],
            bias=json_config['add_bias_linear'],
            add_bias_linear=json_config['add_bias_linear'],
            gated_linear_unit=json_config['swiglu']
        )

        model = MambaModel(config=config, max_sequence_length=json_config['max_sequence_length'], **kwargs)

        # make keys match
        model_state_dict["embedding.weight"] = model_state_dict["embedding.word_embeddings.weight"].clone()
        model_state_dict["output_layer.weight"] = model_state_dict["embedding.word_embeddings.weight"].clone()
        model_state_dict["embedding.word_embeddings.weight"] = None
        model_state_dict.pop("embedding.word_embeddings.weight")
        model.load_state_dict(loaded["model"])
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, 'pytorch_model.bin')
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, 'config.json')
        with open(config_path, 'w') as f:
            json.dump(self.config.__dict__, f)
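
The weight tying done in initialize_last_stage_with_word_embeddings() is the usual embedding/output-head sharing; a self-contained illustration of that pattern (shapes are arbitrary assumptions, not values from this upload):

    import torch
    import torch.nn as nn

    vocab, hidden = 100, 16
    embedding = nn.Embedding(vocab, hidden)
    output_layer = nn.Linear(hidden, vocab, bias=False)
    output_layer.weight = embedding.weight    # the same Parameter object is reused

    assert output_layer.weight.data_ptr() == embedding.weight.data_ptr()
    tokens = torch.randint(0, vocab, (2, 5))
    logits = output_layer(embedding(tokens))  # (2, 5, vocab)
    print(logits.shape)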
mlp.py
ADDED
@@ -0,0 +1,43 @@
from dataclasses import dataclass
from typing import Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils import bias_gelu_impl
from mamba_config import MambaConfig

class MLP(nn.Module):
    def __init__(
        self, config: MambaConfig, is_expert: bool = False, layer_idx=None
    ):
        super().__init__()

        self.config: MambaConfig = config
        self.layer = layer_idx
        ffn_hidden_size_1 = self.config.ffn_hidden_size
        ffn_hidden_size_2 = self.config.ffn_hidden_size

        # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
        if self.config.gated_linear_unit:
            ffn_hidden_size_1 *= 2

        self.linear_fc1 = nn.Linear(self.config.hidden_size, ffn_hidden_size_1, bias=self.config.add_bias_linear, device=self.config.device)
        self.linear_fc1.is_expert = is_expert

        if self.config.gated_linear_unit:

            def glu(x):
                x = torch.chunk(x, 2, dim=-1)
                return self.config.activation_func(x[0]) * x[1]

            self.activation_func = glu
        else:
            self.activation_func = self.config.activation_func

        self.linear_fc2 = nn.Linear(ffn_hidden_size_2, self.config.hidden_size, bias=self.config.add_bias_linear, device=self.config.device)

    def forward(self, hidden_states, inference_params=None):
        intermediate = self.linear_fc1(hidden_states)
        intermediate = self.activation_func(intermediate)
        output = self.linear_fc2(intermediate)
        return output
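
A small CPU sketch of the gated-linear-unit path (sizes are assumptions for illustration; note that linear_fc1 is twice as wide as ffn_hidden_size when gated_linear_unit is set):

    import torch
    from mamba_config import MambaConfig
    from mlp import MLP

    cfg = MambaConfig(num_layers=2, hidden_size=8, state_size=4,
                      conv_dimension=4, gated_linear_unit=True, device="cpu")
    mlp = MLP(cfg, layer_idx=0)
    x = torch.randn(2, 3, 8)
    print(mlp(x).shape)                 # torch.Size([2, 3, 8])
    print(mlp.linear_fc1.weight.shape)  # (2 * ffn_hidden_size, hidden) = (64, 8)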
setup.py
ADDED
@@ -0,0 +1,159 @@
# Copyright (c) 2023, Albert Gu, Tri Dao.
import warnings
import os
from pathlib import Path

from packaging.version import parse, Version
from setuptools import setup, find_packages
import subprocess


import torch
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    CUDAExtension,
    CUDA_HOME,
)

PACKAGE_NAME = "blackmamba"
VERSION = "0.0.1"

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()


# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))

# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
FORCE_BUILD = os.getenv("MAMBA_FORCE_BUILD", "FALSE") == "TRUE"
SKIP_CUDA_BUILD = os.getenv("MAMBA_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
FORCE_CXX11_ABI = os.getenv("MAMBA_FORCE_CXX11_ABI", "FALSE") == "TRUE"


def get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output(
        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
    )
    output = raw_output.split()
    release_idx = output.index("release") + 1
    bare_metal_version = parse(output[release_idx].split(",")[0])

    return raw_output, bare_metal_version


def check_if_cuda_home_none(global_option: str) -> None:
    if CUDA_HOME is not None:
        return
    # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary
    # in that case.
    warnings.warn(
        f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? "
        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
        "only images whose names contain 'devel' will provide nvcc."
    )


def append_nvcc_threads(nvcc_extra_args):
    return nvcc_extra_args + ["--threads", "4"]


ext_modules = []
if not SKIP_CUDA_BUILD:
    print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
    TORCH_MAJOR = int(torch.__version__.split(".")[0])
    TORCH_MINOR = int(torch.__version__.split(".")[1])

    check_if_cuda_home_none(PACKAGE_NAME)
    # Check, if CUDA11 is installed for compute capability 8.0
    cc_flag = []
    if CUDA_HOME is not None:
        _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
        if bare_metal_version < Version("11.6"):
            raise RuntimeError(
                f"{PACKAGE_NAME} is only supported on CUDA 11.6 and above. "
                "Note: make sure nvcc has a supported version by running nvcc -V."
            )

    cc_flag.append("-gencode")
    cc_flag.append("arch=compute_70,code=sm_70")
    cc_flag.append("-gencode")
    cc_flag.append("arch=compute_80,code=sm_80")
    if bare_metal_version >= Version("11.8"):
        cc_flag.append("-gencode")
        cc_flag.append("arch=compute_90,code=sm_90")

    # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
    # torch._C._GLIBCXX_USE_CXX11_ABI
    # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
    if FORCE_CXX11_ABI:
        torch._C._GLIBCXX_USE_CXX11_ABI = True

    ext_modules.append(
        CUDAExtension(
            name="selective_scan_cuda",
            sources=[
                "csrc/selective_scan/selective_scan.cpp",
                "csrc/selective_scan/selective_scan_fwd_fp32.cu",
                "csrc/selective_scan/selective_scan_fwd_fp16.cu",
                "csrc/selective_scan/selective_scan_fwd_bf16.cu",
                "csrc/selective_scan/selective_scan_bwd_fp32_real.cu",
                "csrc/selective_scan/selective_scan_bwd_fp32_complex.cu",
                "csrc/selective_scan/selective_scan_bwd_fp16_real.cu",
                "csrc/selective_scan/selective_scan_bwd_fp16_complex.cu",
                "csrc/selective_scan/selective_scan_bwd_bf16_real.cu",
                "csrc/selective_scan/selective_scan_bwd_bf16_complex.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3", "-std=c++17"],
                "nvcc": append_nvcc_threads(
                    [
                        "-O3",
                        "-std=c++17",
                        "-U__CUDA_NO_HALF_OPERATORS__",
                        "-U__CUDA_NO_HALF_CONVERSIONS__",
                        "-U__CUDA_NO_BFLOAT16_OPERATORS__",
                        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
                        "-U__CUDA_NO_BFLOAT162_OPERATORS__",
                        "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
                        "--expt-relaxed-constexpr",
                        "--expt-extended-lambda",
                        "--use_fast_math",
                        "--ptxas-options=-v",
                        "-lineinfo",
                    ]
                    + cc_flag
                ),
            },
            include_dirs=[Path(this_dir) / "csrc" / "selective_scan"],
        )
    )


setup(
    name=PACKAGE_NAME,
    version=VERSION,
    description="Blackmamba state-space + MoE model",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(include=['ops'],),
    exclude=(
        "csrc",
        "blackmamba.egg-info",
    ),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
    python_requires=">=3.7",
    install_requires=[
        "torch",
        "packaging",
        "ninja",
        "einops",
        "triton",
        "transformers",
        "causal_conv1d>=1.1.0",
    ],
)
switch_mlp.py
ADDED
@@ -0,0 +1,91 @@
import torch
import torch.nn as nn
import pickle
import os
import torch.nn.functional as F

from mamba_config import MambaConfig
from mlp import MLP

def sinkhorn(cost, tol=0.0001):
    "Sinkhorn based MoE routing function"
    cost = torch.exp(2.0 * cost)
    d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
    # d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
    d1 = 1 / (cost.size(1) * torch.sum(cost, 0))

    eps = 0.00000001
    error = 1e9
    d1_old = d1
    while error > tol:
        d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
        d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
        error = torch.mean(torch.abs(d1_old - d1))
        d1_old = d1
    return d1 * cost * d0.unsqueeze(1)


class SwitchMLP(nn.Module):
    """
    Top-1 Mixture of Experts Layer. Routes input to one of N MLP "experts".
    Currently supports Sinkhorn-based expert routing.
    """

    def __init__(self, config: MambaConfig, layer_idx=None):
        super().__init__()

        self.layer = layer_idx
        self.config: MambaConfig = config
        if config.mamba_moe_layers:
            self.num_moe_experts = int(config.mamba_moe_layers[layer_idx-1][-1])
        else:
            self.num_moe_experts = self.config.num_moe_experts
        self.router = torch.nn.Linear(self.config.hidden_size, self.num_moe_experts)
        self.add_bias = config.add_bias_linear
        self.routing = config.routing_mode  # 'sinkhorn', 'top1', 'top2', 'sinkhorn_top2'
        self.route_algo = sinkhorn
        self.router_activation = torch.sigmoid

        self.num_local_experts = self.num_moe_experts
        self.local_expert_indices = [i for i in range(self.num_local_experts)]

        self.local_experts = torch.nn.ModuleList()
        for _ in range(self.num_local_experts):
            expert = MLP(self.config, is_expert=True, layer_idx=layer_idx)
            self.local_experts.append(expert)

    def gather_indices(self, local_indices):
        return local_indices

    def forward(self, hidden_states, inference_params=None):

        hidden_shape = hidden_states.shape
        route = self.router(hidden_states)
        route = route.view(-1, self.num_moe_experts)

        if self.routing == 'sinkhorn':
            route = self.router_activation(route)
            max_prob, max_ind = torch.max(route, dim=1)
        else:
            route = torch.softmax(route, dim=1)
            max_prob, max_ind = torch.max(route, dim=1)

        max_prob = torch.unsqueeze(max_prob, 1)
        hidden_states = hidden_states.view(-1, hidden_shape[-1])

        global_hidden_states = hidden_states
        global_indices = max_ind
        output_total = torch.zeros_like(global_hidden_states)


        for expert_num, expert in enumerate(self.local_experts):
            local_expert_index = self.local_expert_indices[expert_num]
            local_indices = (global_indices == local_expert_index).nonzero()
            hidden = global_hidden_states[local_indices, :]
            output = expert(hidden)
            output_total[local_indices, :] = output

        output_total = output_total * max_prob
        output_total = output_total.view(hidden_shape)

        return output_total
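
A CPU-only sketch of the top-1 routing path (expert count and tensor shapes are assumptions; with an empty mamba_moe_layers the expert count falls back to num_moe_experts):

    import torch
    from mamba_config import MambaConfig
    from switch_mlp import SwitchMLP

    cfg = MambaConfig(num_layers=2, hidden_size=8, state_size=4,
                      conv_dimension=4, num_moe_experts=4, device="cpu")
    moe = SwitchMLP(cfg, layer_idx=1)
    x = torch.randn(2, 5, 8)
    print(moe(x).shape)   # torch.Size([2, 5, 8]); each token is routed to one expert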
utils.py
ADDED
@@ -0,0 +1,82 @@
from operator import itemgetter
from typing import Any, Dict, Iterable, Optional, Tuple, Union
import math
import torch


def attention_mask_func(attention_scores, attention_mask):
    attention_scores.masked_fill_(attention_mask, -10000.0)
    return attention_scores


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
        1 + tanh_out
    )
    return ff * g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, bias):
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        tmp = bias_gelu_back(grad_output, bias, input)
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply


# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
@torch.jit.script
def erf_gelu(x):
    return (
        x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
    )


def init_method_normal(sigma):

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_