Skip to content

Tokenizers¶

Base Tokenizer

BaseTokenizer ¶

Bases: BaseModel, ABC

Base Tokenizer Class.

Source code in src/fed_rag/base/tokenizer.py
class BaseTokenizer(BaseModel, ABC):
    """Abstract interface that every tokenizer implementation must satisfy.

    Subclasses provide ``encode``, ``decode`` and the ``unwrapped`` property.
    All three are declared abstract here and carry no implementation.
    """

    # Let subclasses declare fields of types pydantic has no built-in
    # validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    @abstractmethod
    def encode(self, input: str, **kwargs: Any) -> EncodeResult:
        """Encode the input string into token ids.

        Args:
            input: The text to encode.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The encoding of ``input`` as an ``EncodeResult``.
        """

    @abstractmethod
    def decode(self, input_ids: list[int], **kwargs: Any) -> str:
        """Decode the input token ids into a string.

        Args:
            input_ids: The token ids to decode.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The decoded text.
        """

    @property
    @abstractmethod
    def unwrapped(self) -> Any:
        """Return the underlying tokenizer if there is one."""

unwrapped abstractmethod property ¶

unwrapped

Return the underlying tokenizer if there is one.

encode abstractmethod ¶

encode(input, **kwargs)

Encode the input string into a list of integer token ids.

Source code in src/fed_rag/base/tokenizer.py
@abstractmethod
def encode(self, input: str, **kwargs: Any) -> EncodeResult:
    """Encode the input string into token ids.

    Args:
        input: The text to encode.
        **kwargs: Additional, implementation-specific options.

    Returns:
        The encoding of ``input`` as an ``EncodeResult``.
    """

decode abstractmethod ¶

decode(input_ids, **kwargs)

Decode the input token ids into a string.

Source code in src/fed_rag/base/tokenizer.py
@abstractmethod
def decode(self, input_ids: list[int], **kwargs: Any) -> str:
    """Decode the input token ids into a string.

    Args:
        input_ids: The token ids to decode.
        **kwargs: Additional, implementation-specific options.

    Returns:
        The decoded text.
    """