easydel.modules.qwen3.qwen3_configuration

easydel.modules.qwen3.qwen3_configuration

class easydel.modules.qwen3.qwen3_configuration.Qwen3Config(vocab_size=151936, hidden_size=4096, intermediate_size=22016, num_hidden_layers=32, num_attention_heads=32, num_key_value_heads=32, head_dim=128, hidden_act='silu', max_position_embeddings=32768, initializer_range=0.02, rms_norm_eps=1e-06, use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, rope_scaling=None, attention_bias=False, use_sliding_window=False, sliding_window=4096, max_window_layers=28, attention_dropout=0.0, **kwargs)

Bases: EasyDeLBaseConfig

get_partition_rules(*args, **kwargs)

Get the partition rules for the model.

Parameters

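fully_sharded_data_parallel (bool, optional, defaults to True) – Whether to use fully sharded data parallelism. Note: the signature above exposes only *args and **kwargs, so this parameter is not named explicitly; confirm against the source that it is still honored.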

Returns

The partition rules, as a tuple of (str, PartitionSpec) pairs.

Return type

tp.Tuple[tp.Tuple[str, PartitionSpec], ...]

model_type: str = 'qwen3'