Source code for easydel.trainers.reward_trainer.reward_config

# Copyright 2025 The EasyDeL Author @erfanzar (Erfan Zare Chavoshi).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import field

from eformer.pytree import auto_pytree

from easydel.utils import Registry
from easydel.utils.compiling_utils import hash_fn

from ..training_configurations import TrainingArguments


[docs]@Registry.register("trainer-arguments", "reward")
@auto_pytree
class RewardConfig(TrainingArguments):
    """Configuration class for Reward Model training.

    Reward models are crucial components in RLHF pipelines, learning to predict
    human preferences between different model outputs. The trained reward model
    serves as a proxy for human judgment, providing feedback signals for
    policy optimization.

    This configuration extends TrainingArguments with parameters specific to
    training reward models using pairwise preference data. The model learns
    to assign higher scores to preferred (chosen) responses compared to
    non-preferred (rejected) responses.

    Key concepts:
    - Bradley-Terry model: P(chosen > rejected) = sigmoid(r_chosen - r_rejected)
    - Margin-based losses: Optionally enforce minimum score differences
    - Reward centering: Regularization to maintain mean-zero rewards

    Attributes:
        trainer_prefix (str | None): Prefix for trainer logs and checkpoints.
            Default: "rewardtrainer"
        max_sequence_length (int | None): Maximum length of sequences (prompt + completion).
            Sequences exceeding this limit are filtered out. Default: 1024
        disable_dropout (bool): Whether to disable dropout during training for
            more deterministic behavior. Recommended for reward models. Default: True
        dataset_num_proc (int | None): Number of processes for parallel dataset
            preprocessing. None uses sequential processing. Default: None
        center_rewards_coefficient (float | None): Coefficient for reward centering
            regularization. Encourages the model to output mean-zero rewards,
            preventing reward drift. Default: 0.1
        remove_unused_columns (bool | None): Whether to remove columns not used
            by the model's forward pass. Only set True if dataset is pretokenized.
            Default: False

    Example:
        >>> config = RewardConfig(
        ...     max_sequence_length=2048,
        ...     center_rewards_coefficient=0.01,
        ...     learning_rate=2e-5,
        ...     num_train_epochs=1
        ... )

    Note:
        The reward model typically uses the same architecture as the base LLM
        but with a scalar reward head instead of the language modeling head.
        Training requires paired preference data with chosen and rejected examples.
    """

    trainer_prefix: str | None = field(
        default="rewardtrainer",
        metadata={"help": "default prefix name for trainer."},
    )
    max_sequence_length: int | None = field(
        default=1024,
        metadata={
            "help": "Maximum length of the sequences (prompt + completion) in the batch, "
            "filters out entries that exceed the limit."
        },
    )
    disable_dropout: bool = field(
        default=True,
        metadata={"help": "Whether to disable dropout in the model."},
    )
    dataset_num_proc: int | None = field(
        default=None,
        metadata={"help": "Number of processes to use for processing the dataset."},
    )
    center_rewards_coefficient: float | None = field(
        default=0.1,
        metadata={"help": "Coefficient to incentivize the reward model to output mean-zero rewards."},
    )
    remove_unused_columns: bool = field(
        default=False,
        metadata={
            "help": "Whether to remove the columns that are not used by the model's forward pass. Can be `True` "
            "only if the dataset is pretokenized."
        },
    )

    __hash__ = hash_fn