Skip to content

Tokenizers

Base Tokenizer

EncodeResult

Bases: TypedDict

Data container for tokenizer encoding results.

Source code in src/fed_rag/base/tokenizer.py
class EncodeResult(TypedDict):
    """Data container for tokenizer encoding results.

    A typed dictionary with two keys, ``input_ids`` and ``attention_mask``.
    """

    # Token ids produced by encoding, as a list of integers.
    input_ids: list[int]
    # Optional mask; may be ``None``.
    # NOTE(review): presumably one entry per token id -- confirm against the
    # concrete tokenizer implementations.
    attention_mask: list[int] | None

BaseTokenizer

Bases: BaseModel, ABC

Base Tokenizer Class.

This abstract class provides the interface for creating Tokenizer objects that convert strings into tokens.

Source code in src/fed_rag/base/tokenizer.py
class BaseTokenizer(BaseModel, ABC):
    """Base Tokenizer Class.

    This abstract class provides the interface for creating Tokenizer objects
    that convert strings into tokens. Concrete subclasses must implement
    ``encode``, ``decode`` and the ``unwrapped`` property.
    """

    # NOTE(review): presumably pydantic's ConfigDict; `arbitrary_types_allowed`
    # lets subclasses declare fields of types without a built-in validator.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    @abstractmethod
    def encode(self, input: str, **kwargs: Any) -> EncodeResult:
        """Encode the input string into token ids.

        Args:
            input (str): The input string to be encoded.
            **kwargs (Any): Additional keyword arguments; their meaning is
                left to the concrete implementation.

        Returns:
            EncodeResult: The result of encoding.
        """

    @abstractmethod
    def decode(self, input_ids: list[int], **kwargs: Any) -> str:
        """Decode the input token ids into a string.

        Args:
            input_ids (list[int]): The token ids to be decoded back to text.
            **kwargs (Any): Additional keyword arguments; their meaning is
                left to the concrete implementation.

        Returns:
            str: The decoded text.
        """

    @property
    @abstractmethod
    def unwrapped(self) -> Any:
        """Return the underlying tokenizer if there is one."""

unwrapped abstractmethod property

unwrapped

Return the underlying tokenizer if there is one.

encode abstractmethod

encode(input, **kwargs)

Encode the input string into a list of integers.

Parameters:

Name Type Description Default
input str

The input string to be encoded.

required

Returns:

Name Type Description
EncodeResult EncodeResult

The result of encoding.

Source code in src/fed_rag/base/tokenizer.py
@abstractmethod
def encode(self, input: str, **kwargs: Any) -> EncodeResult:
    """Encode the input string into token ids.

    Args:
        input (str): The input string to be encoded.
        **kwargs (Any): Additional keyword arguments; their meaning is
            left to the concrete implementation.

    Returns:
        EncodeResult: The result of encoding.
    """

decode abstractmethod

decode(input_ids, **kwargs)

Decode the input token ids into a string.

Parameters:

Name Type Description Default
input_ids list[int]

The token ids to be decoded back to text.

required

Returns:

Name Type Description
str str

The decoded text.

Source code in src/fed_rag/base/tokenizer.py
@abstractmethod
def decode(self, input_ids: list[int], **kwargs: Any) -> str:
    """Decode the input token ids into a string.

    Args:
        input_ids (list[int]): The token ids to be decoded back to text.
        **kwargs (Any): Additional keyword arguments; their meaning is
            left to the concrete implementation.

    Returns:
        str: The decoded text.
    """