# Source code for easydel.modules.mistral3.mistral3_tokenizer

# Copyright 2025 The EasyDeL Author @erfanzar (Erfan Zare Chavoshi).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import numpy as np

try:
    from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest, MistralTokenizer, SpecialTokenPolicy
except ImportError:
    ChatCompletionRequest, MistralTokenizer, SpecialTokenPolicy = (
        type(None),
        type(None),
        type(None),
    )


[docs]class Mistral3Tokenizer:
    """
    A wrapper class to make the `mistral-common` tokenizer behave like a
    Hugging Face `transformers` tokenizer. This is useful for maintaining a
    consistent API in projects that might use various tokenizers.

    Attributes:
        mistral_tokenizer: The original MistralTokenizer instance.
        pad_token_id: The ID of the padding token.
        eos_token_id: The ID of the end-of-sequence token.
        bos_token_id: The ID of the beginning-of-sequence token.
    """

    def __init__(self, mistral_tokenizer: MistralTokenizer):  # type: ignore[no-untyped-def]
        if MistralTokenizer is None:
            raise ImportError("mistral-common is not installed. Please install it with `pip install mistral-common`.")
        self.mistral_tokenizer = mistral_tokenizer
        tokenizer = self.mistral_tokenizer.instruct_tokenizer.tokenizer
        self.pad_token_id = tokenizer.pad_id
        self.eos_token_id = tokenizer.eos_id
        self.bos_token_id = tokenizer.bos_id
        self.tokenizer = tokenizer
        self.padding_side = "left"

[docs]    def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
        """
        Encodes a single string into a list of token IDs.

        This method maps the `add_special_tokens` flag to the `bos` and `eos`
        arguments of the underlying Mistral tokenizer.

        Args:
            text: The input text to encode.
            add_special_tokens: Whether to add special tokens (BOS/EOS).

        Returns:
            A list of token IDs.
        """
        return self.tokenizer.encode(text, bos=add_special_tokens, eos=add_special_tokens)

[docs]    def decode(self, token_ids: list[int], skip_special_tokens: bool = True) -> str:
        """
        Decodes a list of token IDs back into a string.

        Args:
            token_ids: The list of token IDs to decode.
            skip_special_tokens: Whether to remove special tokens from the
                                 decoded string.

        Returns:
            The decoded text string.
        """
        policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
        return self.mistral_tokenizer.decode(token_ids, policy)

[docs]    def apply_chat_template(
        self,
        conversation: list[dict[str, str]],
        tokenize: bool = True,
        add_special_tokens: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: int | None = None,
        return_tensors: str | None = None,
        **kwargs,
    ) -> str | list[int] | dict[str, Any]:
        """
        Applies a chat template to a conversation history.

        Args:
            conversation: A list of message dictionaries, each with 'role' and 'content'.
            tokenize: If False, returns the formatted string. If True, tokenizes it.
            add_special_tokens: Whether to add special tokens.
            padding: Whether to pad the sequences.
            truncation: Whether to truncate the sequences.
            max_length: The maximum length for truncation or padding.
            return_tensors: The tensor format for the output (e.g., 'np').

        Returns:
            The processed output, which can be a string, list of IDs, or a dict.
        """
        tokenized = self.mistral_tokenizer.encode_chat_completion(ChatCompletionRequest(messages=conversation))
        formatted_text = tokenized.text

        if not tokenize:
            return formatted_text

        return self.__call__(
            formatted_text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def __call__(
        self,
        text: str | list[str],
        padding: bool | str = False,
        truncation: bool | str = False,
        max_length: int | None = None,
        return_tensors: str | None = None,
        add_special_tokens: bool = True,
        **kwargs,
    ) -> dict[str, Any]:
        """
        Tokenizes a single text or a batch of texts, with advanced options for
        padding and truncation, mimicking Hugging Face tokenizers.

        Args:
            text: A single string or a list of strings to tokenize.
            padding: Controls padding.
                - `False` or `'do_not_pad'`: No padding.
                - `True` or `'longest'`: Pad to the longest sequence in the batch.
                - `'max_length'`: Pad to `max_length`.
            truncation: Controls truncation.
                - `False` or `'do_not_truncate'`: No truncation.
                - `True` or `'longest_first'`: Truncate to `max_length`.
            max_length: The maximum sequence length. Required for truncation
                        and `padding='max_length'`.
            return_tensors: If 'np', returns numpy arrays. Otherwise, returns lists.
            add_special_tokens: Whether to add special tokens like BOS and EOS.

        Returns:
            A dictionary containing 'input_ids' and 'attention_mask'.
        """
        is_single_input = isinstance(text, str)
        batch_texts = [text] if is_single_input else text

        if not batch_texts:
            return {"input_ids": [], "attention_mask": []}

        batch_token_ids = [self.encode(txt, add_special_tokens=add_special_tokens) for txt in batch_texts]

        if truncation and max_length:
            batch_token_ids = [tokens[:max_length] for tokens in batch_token_ids]

        if padding:
            if self.pad_token_id is None:
                raise ValueError(
                    "Padding is enabled, but the tokenizer does not have a `pad_token_id`. You can set one manually."
                )

            if padding == "longest" or padding is True:
                max_len = max(len(tokens) for tokens in batch_token_ids)
            elif padding == "max_length":
                if max_length is None:
                    raise ValueError("You must specify `max_length` when using `padding='max_length'`.")
                max_len = max_length
            else:
                max_len = 0
            if max_len > 0:
                if self.padding_side not in ["left", "right"]:
                    raise ValueError(f"padding_side must be 'left' or 'right', but got '{self.padding_side}'")

                padded_ids = []
                attention_masks = []
                for tokens in batch_token_ids:
                    num_to_pad = max_len - len(tokens)

                    if self.padding_side == "right":
                        padded_ids.append(tokens + [self.pad_token_id] * num_to_pad)
                        attention_masks.append([1] * len(tokens) + [0] * num_to_pad)
                    else:
                        padded_ids.append([self.pad_token_id] * num_to_pad + tokens)
                        attention_masks.append([0] * num_to_pad + [1] * len(tokens))

                batch_token_ids = padded_ids
            else:
                attention_masks = [[1] * len(tokens) for tokens in batch_token_ids]
        else:
            attention_masks = [[1] * len(tokens) for tokens in batch_token_ids]

        result = {
            "input_ids": batch_token_ids,
            "attention_mask": attention_masks,
        }

        if return_tensors == "np":
            result["input_ids"] = np.array(result["input_ids"], dtype=np.int64)
            result["attention_mask"] = np.array(result["attention_mask"], dtype=np.int64)

        if is_single_input and return_tensors is None:
            result["input_ids"] = result["input_ids"][0]
            result["attention_mask"] = result["attention_mask"][0]

        return result

[docs]    def batch_encode_plus(self, *args, **kwargs) -> dict[str, Any]:
        """Alias for `__call__` for Hugging Face compatibility."""
        return self.__call__(*args, **kwargs)

[docs]    def encode_plus(self, *args, **kwargs) -> dict[str, Any]:
        """Alias for `__call__` for Hugging Face compatibility."""
        return self.__call__(*args, **kwargs)

[docs]    @classmethod
    def from_hf_hub(cls, model_name: str = "mistralai/Mistral-Nemo-Instruct-2407"):
        """
        Creates an instance from a model name on the Hugging Face Hub.

        Args:
            model_name: The name of the Mistral model on the Hub.

        Returns:
            An instance of Mistral3Tokenizer.
        """
        if MistralTokenizer is None:
            raise ImportError("mistral-common is not installed. Please install it with `pip install mistral-common`.")
        mistral_tokenizer = MistralTokenizer.from_hf_hub(repo_id=model_name)
        return cls(mistral_tokenizer)