easydel.inference.vinference.api_server.__init__

easydel.inference.vinference.api_server.__init__#

class easydel.inference.vinference.api_server.__init__.ChatCompletionRequest(*, model: str, messages: List[ChatMessage], max_tokens: int = 16, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repetition_penalty: float = 1.0, temperature: float = 0.0, top_p: float = 1.0, top_k: int = 0, min_p: float = 0.0, suppress_tokens: List[int] = <factory>, functions: Optional[List[FunctionDefinition]] = None, function_call: Optional[Union[str, Dict[str, Any]]] = None, tools: Optional[List[ToolDefinition]] = None, tool_choice: Optional[Union[str, Dict[str, Any]]] = None, n: Optional[int] = 1, stream: Optional[bool] = False, stop: Optional[Union[str, List[str]]] = None, logit_bias: Optional[Dict[str, float]] = None, user: Optional[str] = None)[source]#

Bases: BaseModel

Represents a request to the chat completion endpoint. Mirrors the OpenAI ChatCompletion request structure.

frequency_penalty: float#

function_call: Optional[Union[str, Dict[str, Any]]]#

functions: Optional[List[FunctionDefinition]]#

logit_bias: Optional[Dict[str, float]]#

max_tokens: int#

messages: List[ChatMessage]#

min_p: float#

model: str#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

n: Optional[int]#

presence_penalty: float#

repetition_penalty: float#

stop: Optional[Union[str, List[str]]]#

stream: Optional[bool]#

suppress_tokens: List[int]#

temperature: float#

tool_choice: Optional[Union[str, Dict[str, Any]]]#

tools: Optional[List[ToolDefinition]]#

top_k: int#

top_p: float#

user: Optional[str]#

class easydel.inference.vinference.api_server.__init__.ChatCompletionResponse(*, id: str = <factory>, object: str = 'chat.completion', created: int = <factory>, model: str, choices: List[ChatCompletionResponseChoice], usage: UsageInfo)[source]#

Bases: BaseModel

Represents a non-streaming response from the chat completion endpoint.

choices: List[ChatCompletionResponseChoice]#

created: int#

id: str#

model: str#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

object: str#

usage: UsageInfo#

class easydel.inference.vinference.api_server.__init__.ChatCompletionResponseChoice(*, index: int, message: ChatMessage, finish_reason: Optional[Literal['stop', 'length', 'function_call']] = None)[source]#

Bases: BaseModel

Represents a single choice within a non-streaming chat completion response.

finish_reason: Optional[Literal['stop', 'length', 'function_call']]#

index: int#

message: ChatMessage#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class easydel.inference.vinference.api_server.__init__.ChatCompletionStreamResponse(*, id: str = <factory>, object: str = 'chat.completion.chunk', created: int = <factory>, model: str, choices: List[ChatCompletionStreamResponseChoice], usage: UsageInfo)[source]#

Bases: BaseModel

Represents a single chunk in a streaming response from the chat completion endpoint.

choices: List[ChatCompletionStreamResponseChoice]#

created: int#

id: str#

model: str#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

object: str#

usage: UsageInfo#

class easydel.inference.vinference.api_server.__init__.ChatCompletionStreamResponseChoice(*, index: int, delta: DeltaMessage, finish_reason: Optional[Literal['stop', 'length', 'function_call']] = None)[source]#

Bases: BaseModel

Represents a single choice within a streaming chat completion response chunk.

delta: DeltaMessage#

finish_reason: Optional[Literal['stop', 'length', 'function_call']]#

index: int#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class easydel.inference.vinference.api_server.__init__.ChatMessage(*, role: str, content: Union[str, List[Mapping[str, str]]], name: Optional[str] = None, function_call: Optional[Dict[str, Any]] = None)[source]#

Bases: BaseModel

Represents a single message in a chat conversation.

content: Union[str, List[Mapping[str, str]]]#

function_call: Optional[Dict[str, Any]]#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

name: Optional[str]#

role: str#

class easydel.inference.vinference.api_server.__init__.CountTokenRequest(*, model: str, conversation: Union[str, List[ChatMessage]])[source]#

Bases: BaseModel

Represents a request to the token counting endpoint.

conversation: Union[str, List[ChatMessage]]#

model: str#

model_config: ClassVar[ConfigDict] = {}#: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.

class easydel.inference.vinference.api_server.__init__.vInferenceApiServer(inference_map: Union[Dict[str, Any], Any] = None, inference_init_call: Optional[Callable[[], Any]] = None, max_workers: int = 10)[source]#

Bases: object

FastAPI server for serving vInference instances.

This server provides endpoints mimicking the OpenAI API structure for chat completions, liveness/readiness checks, token counting, and listing available models. It handles both streaming and non-streaming requests asynchronously using a thread pool.

async available_inference()[source]#: Lists available models (GET /v1/models).

async chat_completions(request: ChatCompletionRequest)[source]#

Handles chat completion requests (POST /v1/chat/completions).

Validates the request, retrieves the appropriate vInference model, tokenizes the input, and delegates to streaming or non-streaming handlers.

Parameters

request (ChatCompletionRequest) – The incoming request data.

Returns

The generated response: either a complete JSON object or a streaming event stream.

Return type

Union[JSONResponse, StreamingResponse]

async completions(request: CompletionRequest)[source]#

Handles completion requests (POST /v1/completions).

Processes the prompt for completion and returns generated text.

Parameters: request (CompletionRequest) – The incoming request data.
Returns: The generated response.
Return type: Union[JSONResponse, StreamingResponse]

async count_tokens(request: CountTokenRequest)[source]#: Token counting endpoint (POST /v1/count_tokens).

fire(host='0.0.0.0', port=11556, metrics_port: Optional[int] = None, log_level='info', ssl_keyfile: Optional[str] = None, ssl_certfile: Optional[str] = None)[source]#

Starts the uvicorn server to run the FastAPI application.

Parameters

host (str) – The host address to bind to. Defaults to “0.0.0.0”.
port (int) – The port to listen on. Defaults to 11556.
metrics_port (tp.Optional[int]) – The port for the Prometheus metrics server. If None, defaults to port + 1. Set to -1 to disable.
log_level (str) – The logging level for uvicorn. Defaults to “info”.
ssl_keyfile (tp.Optional[str]) – Path to the SSL key file for HTTPS.
ssl_certfile (tp.Optional[str]) – Path to the SSL certificate file for HTTPS.

async liveness()[source]#: Liveness check endpoint (GET /liveness).

async readiness()[source]#: Readiness check endpoint (GET /readiness).

class easydel.inference.vinference.api_server.__init__.vInferenceChatCompletionClient(base_url: str, max_retries: int = 5, timeout: float = 30.0)[source]#

Bases: object

Client for interacting with the vInference Chat Completion API endpoint.

This client handles communication with the vInference server, including sending requests, handling responses (streaming or non-streaming), managing retries, and parsing errors.

create_chat_completion(request: ChatCompletionRequest, extra_headers: Optional[dict] = None) → Generator[Union[ChatCompletionStreamResponse, ChatCompletionResponse], None, None][source]#

Sends a chat completion request to the vInference API.

Handles both streaming and non-streaming responses based on the stream attribute in the request object.

Parameters

request (ChatCompletionRequest) – The chat completion request object.
extra_headers (tp.Optional[dict]) – Optional dictionary of extra headers to include in the request. Defaults to None.

Yields

tp.Union[ChatCompletionStreamResponse, ChatCompletionResponse] – For streaming requests, yields ChatCompletionStreamResponse objects for each chunk received. For non-streaming requests, yields a single ChatCompletionResponse object.

Raises

vInferenceAPIError – If the API returns an error status code or if there’s an issue parsing the response.
requests.RequestException – For underlying network connection issues.

easydel.inference.vinference.api_server.__init__

Contents

easydel.inference.vinference.api_server.__init__#

easydel.inference.vinference.api_server.__init__

easydel.inference.vinference.api_server.__init__#