# Copyright 2023 The EASYDEL Author @erfanzar (Erfan Zare Chavoshi).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dbrx configuration."""
import typing as tp
import warnings
from jax.sharding import PartitionSpec
from easydel.infra.base_module import EasyDeLBaseConfig
from easydel.infra.etils import EasyDeLGradientCheckPointers
from easydel.infra.factory import register_config
DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
[docs]class DbrxAttentionConfig(EasyDeLBaseConfig):
"""
This is the configuration class to store the attention related configuration of a [`DbrxModel`].
Args:
attn_pdrop (`float`, *optional*, defaults to 0.0):
The dropout probability applied to the attention output.
clip_qkv (`float`, *optional*, defaults to 8.0):
The clip value applied to the query, key, and value tensors.
kv_n_heads (`int`, *optional*, defaults to 1):
The number of attention heads for the key and value tensors.
rope_theta (`float`, *optional*, defaults to 10000.0):
The theta value for the rotary position embedding.
"""
def __init__(
self,
attn_pdrop: float = 0,
clip_qkv: tp.Optional[float] = 8,
kv_n_heads: int = 1,
rope_theta: float = 10000.0,
**kwargs: tp.Any,
):
super().__init__(**kwargs)
self.attn_pdrop = attn_pdrop
self.clip_qkv = clip_qkv
self.kv_n_heads = kv_n_heads
self.rope_theta = rope_theta
for k in ["model_type"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:
raise ValueError(f"Found unknown {kwargs=}")
[docs] @classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: str, **kwargs: tp.Any
) -> "PretrainedConfig": # type: ignore[misc] # noqa: F821
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "dbrx":
config_dict = config_dict["attn_config"]
if (
"model_type" in config_dict
and hasattr(cls, "model_type")
and config_dict["model_type"] != cls.model_type
):
warnings.warn(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors.",
stacklevel=1,
)
return cls.from_dict(config_dict, **kwargs)
[docs]class DbrxFFNConfig(EasyDeLBaseConfig):
"""
This is the configuration class to store the feed forward related configuration of a [`DbrxModel`].
Args:
ffn_act_fn (`dict`, *optional*):
The activation function configuration for the feed-forward network.
ffn_hidden_size (`int`, *optional*, defaults to 3584):
The hidden size of the feed-forward network.
moe_num_experts (`int`, *optional*, defaults to 4):
The number of experts in the Mixture-of-Experts (MoE) layer.
moe_top_k (`int`, *optional*, defaults to 1):
The number of top experts to use in the MoE layer.
moe_jitter_eps (`float`, *optional*):
The jitter epsilon value for the MoE layer.
moe_loss_weight (`float`, *optional*, defaults to 0.01):
The loss weight for the MoE auxiliary loss.
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0):
The normalization factor for the expert weights in the MoE layer.
uniform_expert_assignment (`bool`, *optional*, defaults to `False`):
Whether to use uniform expert assignment in the MoE layer.
"""
def __init__(
self,
ffn_act_fn: tp.Optional[dict] = None,
ffn_hidden_size: int = 3584,
moe_num_experts: int = 4,
moe_top_k: int = 1,
moe_jitter_eps: tp.Optional[float] = None,
moe_loss_weight: float = 0.01,
moe_normalize_expert_weights: tp.Optional[float] = 1,
uniform_expert_assignment: bool = False,
**kwargs: tp.Any,
):
super().__init__()
if ffn_act_fn is None:
ffn_act_fn = {"name": "silu"}
self.ffn_act_fn = ffn_act_fn
self.ffn_hidden_size = ffn_hidden_size
self.moe_num_experts = moe_num_experts
self.moe_top_k = moe_top_k
self.moe_jitter_eps = moe_jitter_eps
self.moe_loss_weight = moe_loss_weight
self.moe_normalize_expert_weights = moe_normalize_expert_weights
self.uniform_expert_assignment = uniform_expert_assignment
for k in ["model_type"]:
if k in kwargs:
kwargs.pop(k)
if len(kwargs) != 0:
raise ValueError(f"Found unknown {kwargs=}")
[docs] @classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: str, **kwargs: tp.Any
) -> "EasyDeLBaseConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "dbrx":
config_dict = config_dict["ffn_config"]
if (
"model_type" in config_dict
and hasattr(cls, "model_type")
and config_dict["model_type"] != cls.model_type
):
warnings.warn(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors.",
stacklevel=1,
)
return cls.from_dict(config_dict, **kwargs)
[docs]@register_config("dbrx")
class DbrxConfig(EasyDeLBaseConfig):
"""
Configuration objects inherit from [`EasyDeLBaseConfig`] and can be used to control the model outputs. Read
the documentation from [`EasyDeLBaseConfig`] for more information.
Args:
d_model (`int`, *optional*, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
n_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
max_seq_len (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 2048 or 4096).
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the DBRX model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed to the forward method.
resid_pdrop (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
emb_pdrop (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
attn_config ([`DbrxAttentionConfig`], *optional*):
The configuration of the attention layer.
ffn_config ([`DbrxFFNConfig`], *optional*):
The configuration of the feed forward layer.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
output_router_logits (`bool`, *optional*, defaults to `False`):
Whether or not to output the router logits.
router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
The coefficient of the router auxiliary loss.
"""
model_type: str = "dbrx"
attribute_map = {
"num_attention_heads": "n_heads",
"hidden_size": "d_model",
"num_hidden_layers": "n_layers",
"max_position_embeddings": "max_seq_len",
}
def __init__(
self,
d_model: int = 2048,
n_heads: int = 16,
n_layers: int = 24,
max_seq_len: int = 2048,
vocab_size: int = 32000,
resid_pdrop: float = 0.0,
emb_pdrop: float = 0.0,
attn_config: tp.Optional[DbrxAttentionConfig] = None,
ffn_config: tp.Optional[DbrxFFNConfig] = None,
use_cache: bool = True,
initializer_range: float = 0.02,
output_router_logits: bool = False,
router_aux_loss_coef: float = 0.05,
gradient_checkpointing: EasyDeLGradientCheckPointers = EasyDeLGradientCheckPointers.NONE,
**kwargs: tp.Any,
):
"""Initialize the DbrxConfig with the specified parameters.
Args:
d_model (int, optional): Hidden size for the transformer. Defaults to 2048.
n_heads (int, optional): Number of attention heads. Defaults to 16.
n_layers (int, optional): Number of transformer layers. Defaults to 24.
max_seq_len (int, optional): Maximum sequence length. Defaults to 2048.
vocab_size (int, optional): Vocabulary size. Defaults to 32000.
resid_pdrop (float, optional): Residual dropout probability. Defaults to 0.0.
emb_pdrop (float, optional): Embedding dropout probability. Defaults to 0.0.
attn_config (Optional[DbrxAttentionConfig], optional): Configuration for attention. Defaults to None.
ffn_config (Optional[DbrxFFNConfig], optional): Configuration for feed-forward network. Defaults to None.
use_cache (bool, optional): Whether to use KV cache for decoding. Defaults to True.
initializer_range (float, optional): Range for weight initialization. Defaults to 0.02.
output_router_logits (bool, optional): Whether to output router logits. Defaults to False.
router_aux_loss_coef (float, optional): Coefficient for router auxiliary loss. Defaults to 0.05.
gradient_checkpointing (EasyDeLGradientCheckPointers, optional): Gradient checkpointing strategy. Defaults to EasyDeLGradientCheckPointers.NONE.
**kwargs (Any): Additional arguments.
"""
if attn_config is None:
self.attn_config = DbrxAttentionConfig()
elif isinstance(attn_config, dict):
self.attn_config = DbrxAttentionConfig(**attn_config)
else:
self.attn_config = attn_config
if ffn_config is None:
self.ffn_config = DbrxFFNConfig()
elif isinstance(ffn_config, dict):
self.ffn_config = DbrxFFNConfig(**ffn_config)
else:
self.ffn_config = ffn_config
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.resid_pdrop = resid_pdrop
self.emb_pdrop = emb_pdrop
self.use_cache = use_cache
self.initializer_range = initializer_range
self.output_router_logits = output_router_logits
self.router_aux_loss_coef = router_aux_loss_coef
self.gradient_checkpointing = gradient_checkpointing
tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
if tie_word_embeddings:
raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@property
def granted_freq_max_position_embedding(self) -> int:
"""Returns the maximum position embedding size for frequency-based position embeddings.
Returns:
int: The maximum position embedding size, falling back to max_seq_len if not explicitly set.
"""
return getattr(
self,
"freq_max_position_embeddings",
self.max_position_embeddings,
)
@property
def granted_mask_max_position_embedding(self) -> int:
"""Returns the maximum position embedding size for mask-based position embeddings.
Returns:
int: The maximum position embedding size, falling back to max_seq_len if not explicitly set.
"""
return getattr(
self,
"mask_max_position_embeddings",
self.max_position_embeddings,
)
[docs] def get_partition_rules(self, *args, **kwargs):
"""Get the partition rules for the model parameters.
These rules define how parameters should be sharded across devices when using model parallelism.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
Tuple: A tuple of partition rules for different parameter patterns.
"""
return (
# Embeddings
("wte/embedding", PartitionSpec(("fsdp", "sp"), "tp")),
# Language model head
("lm_head/kernel", PartitionSpec(("fsdp", "sp"), "tp")),
("lm_head/bias", PartitionSpec(None)),
# Attention layers
("norm_attn_norm/attn/Wqkv/kernel", PartitionSpec(("fsdp", "sp"), "tp")),
("norm_attn_norm/attn/Wqkv/bias", PartitionSpec(None)),
("norm_attn_norm/attn/out_proj/kernel", PartitionSpec("tp", ("fsdp", "sp"))),
("norm_attn_norm/attn/out_proj/bias", PartitionSpec(None)),
# MoE FFN layers
("ffn/experts/mlp/(v1|w1|w2)", PartitionSpec(("fsdp", "sp"), "tp")),
("ffn/router/layer/kernel", PartitionSpec(("fsdp", "sp"), "tp")),
("ffn/router/layer/bias", PartitionSpec(None)),
# Layer norms
("norm_attn_norm/norm_\d+/(bias|scale)", PartitionSpec(None)),
("transformer/norm_f/(bias|scale)", PartitionSpec(None)),
# Catch-all
(".*", PartitionSpec(None)),
)