easydel.inference.vinference.__init__#

class easydel.inference.vinference.__init__.vInference(model: None, processor_class: None, generation_config: Optional[vInferenceConfig] = None, seed: Optional[int] = None, input_partition_spec: Optional[PartitionSpec] = None, max_new_tokens: int = 512, inference_name: Optional[str] = None)[source]#

Bases: object

Class for performing text generation using a pre-trained language model in EasyDeL.

This class handles the generation process, including initialization, precompilation, and generating text in streaming chunks.

property SEQUENCE_DIM_MAPPING#
count_tokens(messages: List[Dict[str, str]])[source]#
count_tokens(text: str)
generate(input_ids: Array, attention_mask: Optional[Array] = None, *, graphstate: Optional[State[Key, VariableState[Any]]] = None, graphother: Optional[State[Key, VariableState[Any]]] = None, **model_kwargs) → Generator[Union[SampleState, Any], SampleState, SampleState][source]#

Generates text in streaming chunks with comprehensive input adjustment.

Parameters
  • input_ids – Input token IDs as a JAX array

  • attention_mask – Optional attention mask for the input

  • graphstate (nn.GraphState, optional) – Pass this if you want to update the model state used for generation.

  • graphother (nn.GraphState, optional) – Pass this if you want to update the model's other state used for generation.

  • **model_kwargs – Additional model-specific keyword arguments

Returns

Generator yielding SampleState objects containing generation results and metrics

property inference_name#
classmethod load_inference(path: Union[PathLike, str], model: None, processor_class: None)[source]#
property metrics#
property model#
property model_prefill_length: int#

Calculate the maximum length available for input prefill by subtracting the maximum new tokens from the model's maximum sequence length.

Returns

The maximum length available for input prefill

Return type

int

Raises

ValueError – If no maximum sequence length configuration is found

precompile(batch_size: Union[int, List[int]] = 1, input_tokens_length: Optional[Union[int, List[int]]] = None)[source]#

Precompiles the generation functions for a given batch size and input length.

This function checks if the generation functions have already been compiled for the given configuration. If not, it compiles them asynchronously and stores them in a cache.

Parameters
  • batch_size – The batch size.

  • input_tokens_length – The length of the input tokens.

Returns

True if precompilation was successful, False otherwise.

Return type

bool

save_inference(path: Union[PathLike, str])[source]#
property tokenizer#
class easydel.inference.vinference.__init__.vInferenceApiServer(inference_map: Union[Dict[str, Any], Any] = None, inference_init_call: Optional[Callable[[], Any]] = None, max_workers: int = 10)[source]#

Bases: object

available_inference()[source]#
async chat_completions(request: ChatCompletionRequest)[source]#
count_tokens(request: CountTokenRequest)[source]#
fire(host='0.0.0.0', port=11556, metrics_port: Optional[int] = None, log_level='debug')[source]#
liveness()[source]#
patch_endpoints()[source]#

Register all endpoints with the FastAPI app.

readiness()[source]#
class easydel.inference.vinference.__init__.vInferenceConfig(max_new_tokens: int = 64, min_length: Optional[int] = None, streaming_chunks: int = 16, temperature: float = 0.0, top_p: float = 0.95, top_k: int = 50, do_sample: bool = True, no_repeat_ngram_size: Optional[int] = None, num_return_sequences: Union[int, Dict[int, int], NoneType] = 1, suppress_tokens: Optional[list] = None, forced_bos_token_id: Optional[int] = None, forced_eos_token_id: Optional[int] = None, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = None, eos_token_id: Union[int, List[int], NoneType] = None, partition_rules: Optional[Tuple[Tuple[str, Any]]] = None, partition_axis: Optional[eformer.escale.partition.constraints.PartitionAxis] = None, _loop_rows: Optional[int] = None)[source]#

Bases: object

bos_token_id: Optional[int] = None#
do_sample: bool = True#
eos_token_id: Optional[Union[int, List[int]]] = None#
forced_bos_token_id: Optional[int] = None#
forced_eos_token_id: Optional[int] = None#
get_logits_processor()[source]#
get_logits_warper()[source]#
get_partition_rules(runtime_config: Optional[Tuple[int, int]] = None)[source]#
max_new_tokens: int = 64#
min_length: Optional[int] = None#
no_repeat_ngram_size: Optional[int] = None#
num_return_sequences: Optional[Union[int, Dict[int, int]]] = 1#
pad_token_id: Optional[int] = None#
partition_axis: Optional[PartitionAxis] = None#
partition_rules: Optional[Tuple[Tuple[str, Any]]] = None#
streaming_chunks: int = 16#
suppress_tokens: Optional[list] = None#
temperature: float = 0.0#
top_k: int = 50#
top_p: float = 0.95#
tree_flatten()[source]#
classmethod tree_unflatten(aux, children)[source]#