easydel.inference.esurge.config#

class easydel.inference.esurge.config.CacheConfig(num_pages: int | None, page_size: int, enable_prefix_caching: bool)[source]#

Bases: object

Configuration for the KV (key-value) cache.

Manages memory allocation and caching strategies for attention mechanisms.

num_pages#

Number of GPU pages allocated for cache (None for automatic).

Type

int | None

page_size#

Size of each cache page in tokens.

Type

int

enable_prefix_caching#

Enable caching of common prefixes across requests.

Type

bool

Example

>>> config = CacheConfig(
...     num_pages=1000,
...     page_size=16,
...     enable_prefix_caching=True
... )

Note

Page-based allocation allows efficient memory management and sharing of cache blocks between sequences.

enable_prefix_caching: bool#

A flag to enable or disable prefix caching.

num_pages: int | None#

The number of GPU pages allocated for the cache.

page_size: int#

The size of each cache page.

class easydel.inference.esurge.config.Config(scheduler_config: SchedulerConfig, cache_config: CacheConfig, speculative_config: easydel.inference.esurge.config.SpeculativeConfig | None = None)[source]#

Bases: object

Unified configuration for the eSurge engine.

Combines the scheduler, cache, and optional speculative-decoding configurations into a single object.

scheduler_config#

Configuration for request scheduling.

Type

easydel.inference.esurge.config.SchedulerConfig

cache_config#

Configuration for KV cache management.

Type

easydel.inference.esurge.config.CacheConfig

speculative_config#

Configuration for speculative decoding.

Type

easydel.inference.esurge.config.SpeculativeConfig | None

Example

>>> config = Config(
...     scheduler_config=SchedulerConfig(...),
...     cache_config=CacheConfig(...),
...     speculative_config=SpeculativeConfig(num_speculative_tokens=5)
... )

cache_config: CacheConfig#

Nested configuration for the cache.

scheduler_config: SchedulerConfig#

Nested configuration for the scheduler.

speculative_config: easydel.inference.esurge.config.SpeculativeConfig | None = None#

Nested configuration for speculative decoding.

class easydel.inference.esurge.config.SchedulerConfig(max_num_seqs: int, max_num_batched_tokens: int, max_model_len: int, policy: Literal['priority', 'fcfs'] = 'fcfs', long_prefill_token_threshold: int = 256, chunked_prefill_enabled: bool = False, token_safety_margin: int | None = None, max_num_seq_buckets: tuple[int, ...] | None = None, async_scheduling: bool = False)[source]#

Bases: object

Configuration for the request scheduler.

Controls how requests are scheduled and batched for processing.

max_num_seqs#

Maximum number of sequences running simultaneously.

Type

int

max_num_batched_tokens#

Maximum tokens processed in a single batch.

Type

int

max_model_len#

Maximum input length the model can handle.

Type

int

policy#

Scheduling policy (‘fcfs’ for first-come-first-served, ‘priority’ for priority-based).

Type

Literal['priority', 'fcfs']

long_prefill_token_threshold#

Token count threshold for identifying long prefill requests.

Type

int

chunked_prefill_enabled#

Enable chunked processing of long prefill requests.

Type

bool

Example

>>> config = SchedulerConfig(
...     max_num_seqs=16,
...     max_num_batched_tokens=2048,
...     max_model_len=8192,
...     policy="priority"
... )

async_scheduling: bool = False#

Enable async token sampling so that it overlaps with the next forward pass (30-40% latency reduction).

chunked_prefill_enabled: bool = False#

A flag to enable or disable chunked prefilling.

long_prefill_token_threshold: int = 256#

A token threshold for handling long prefill requests.

max_model_len: int#

The maximum length of the model’s input.

max_num_batched_tokens: int#

The maximum number of tokens to be processed in a single batch.

max_num_seq_buckets: tuple[int, ...] | None = None#

Optional explicit request-capacity buckets (e.g., (8, 16, 32, 64)).

max_num_seqs: int#

The maximum number of sequences running at the same time.

policy: Literal['priority', 'fcfs'] = 'fcfs'#

The scheduling policy to use: either 'priority' or 'fcfs'.

token_safety_margin: int | None = None#

Reserved tokens per running request to prevent over-allocation.

class easydel.inference.esurge.config.SpeculativeConfig(num_speculative_tokens: int = 0, speculative_model: str | None = None)[source]#

Bases: object

Configuration for speculative decoding.

num_speculative_tokens#

Number of speculative tokens to generate.

Type

int

speculative_model#

Path to the speculative model (e.g., Eagle model).

Type

str | None

num_speculative_tokens: int = 0#

speculative_model: str | None = None#

use_eagle() → bool[source]#

Check if Eagle speculative decoding is enabled.