Skip to content

Huggingface

HuggingFace Data Collator For LM-Supervised Retriever Training

DataCollatorForLSR

Bases: SentenceTransformerDataCollator, BaseDataCollator

A HuggingFace DataCollator for LM-Supervised Retrieval.

Source code in src/fed_rag/data_collators/huggingface/lsr.py
class DataCollatorForLSR(SentenceTransformerDataCollator, BaseDataCollator):
    """A HuggingFace DataCollator for LM-Supervised Retrieval (LSR).

    For every ('query', 'response') example, the collator retrieves context
    chunks via the wrapped RAG system and produces two score tensors per
    example: differentiable retriever scores and no-grad LM supervision
    scores computed by the generator.
    """

    # Templates used to build the generator prompt/target when computing
    # the LM supervision scores.
    prompt_template: str = Field(default=DEFAULT_PROMPT_TEMPLATE)
    target_template: str = Field(default=DEFAULT_TARGET_TEMPLATE)
    # Only "pt" (PyTorch) is currently supported by __call__.
    default_return_tensors: str = Field(default="pt")

    # Declared so Pydantic is aware of the fields coming from the
    # SentenceTransformerDataCollator dataclass base.
    tokenize_fn: Callable = Field(
        default_factory=lambda: (lambda *args, **kwargs: {})
    )
    valid_label_columns: list[str] = Field(
        default_factory=lambda: ["label", "score"]
    )
    _warned_columns: set = PrivateAttr(
        default_factory=set
    )  # exclude=True to match dataclass repr=False

    def __init__(
        self,
        rag_system: RAGSystem,
        prompt_template: str | None = None,
        target_template: str | None = None,
        default_return_tensors: str = "pt",
        **kwargs: Any,
    ):
        """Create a DataCollatorForLSR.

        Args:
            rag_system: The RAG system providing the retriever and generator.
            prompt_template: Optional override for the generator prompt
                template; falls back to DEFAULT_PROMPT_TEMPLATE.
            target_template: Optional override for the generator target
                template; falls back to DEFAULT_TARGET_TEMPLATE.
            default_return_tensors: Framework used when ``__call__`` is not
                given one explicitly. Only "pt" is supported.
            **kwargs: Forwarded to the pydantic base model.

        Raises:
            MissingExtraError: If the `huggingface` extra is not installed.
        """
        if not _has_huggingface:
            msg = (
                f"`{self.__class__.__name__}` requires `huggingface` extra to be installed. "
                "To fix please run `pip install fed-rag[huggingface]`."
            )
            raise MissingExtraError(msg)

        _validate_rag_system(rag_system)

        prompt_template = prompt_template or DEFAULT_PROMPT_TEMPLATE
        target_template = target_template or DEFAULT_TARGET_TEMPLATE

        # init pydantic base model
        BaseDataCollator.__init__(
            self,
            rag_system=rag_system,
            prompt_template=prompt_template,
            target_template=target_template,
            default_return_tensors=default_return_tensors,
            tokenize_fn=lambda *args, **kwargs: {},  # Pass this to Pydantic
            **kwargs,
        )

    def __call__(
        self, features: list[dict[str, Any]], return_tensors: str | None = None
    ) -> dict[str, Any]:
        """Use the features of the dataset in order to get the retrieval and lm-scores.

        Args:
            features (list[dict[str, Any]]): Each element should contain a
                'query' and a 'response' field.
            return_tensors (str | None, optional): Only 'pt' is supported;
                defaults to ``self.default_return_tensors``.

        Returns:
            dict[str, Any]: a dictionary of ~torch.Tensors with keys 'retrieval_scores'
                and 'lm_scores'
            Note that each ('query', 'response') pair generates one fine-tuning instance for LSR.

        Raises:
            FedRAGError: If ``return_tensors`` is not 'pt', or if ``features``
                is empty.
        """
        return_tensors = (
            return_tensors if return_tensors else self.default_return_tensors
        )
        if return_tensors != "pt":
            raise FedRAGError(f"Framework '{return_tensors}' not recognized!")

        if not features:
            # torch.stack on an empty list raises an opaque RuntimeError;
            # fail early with a clear, library-specific error instead.
            raise FedRAGError("Cannot collate an empty list of features.")

        # use rag system to get scores
        batch_retriever_scores = []
        batch_lm_scores = []
        for example in features:
            query = example.get("query")
            response = example.get("response")

            # retriever scores - this should participate in gradient computation
            # NOTE(review): torch.tensor(...) creates a new leaf tensor, so
            # gradients reach this tensor but NOT the retriever's parameters
            # through it — confirm this is the intended LSR setup.
            source_nodes = self.rag_system.retrieve(query)
            retriever_scores = torch.tensor(
                [n.score for n in source_nodes], requires_grad=True
            )

            # lm supervised scores - we don't want these to participate in gradient computation
            lm_scores = []
            with torch.no_grad():
                for chunk in source_nodes:
                    prompt = self.prompt_template.format(
                        query=query,
                        context=chunk.node.get_content()["text_content"],
                    )
                    target = self.target_template.format(response=response)
                    lm_score = self.rag_system.generator.compute_target_sequence_proba(
                        prompt=prompt, target=target
                    )
                    lm_scores.append(lm_score)
                lm_scores = torch.stack(lm_scores, dim=0)

            # append to batch
            batch_retriever_scores.append(retriever_scores)
            batch_lm_scores.append(lm_scores)

        # create torch.Tensors; each row corresponds to one example's
        # retrieved chunks, so all examples must yield the same top-k.
        retrieval_scores = torch.stack(batch_retriever_scores, dim=0)
        lm_scores = torch.stack(batch_lm_scores, dim=0)

        return {"retrieval_scores": retrieval_scores, "lm_scores": lm_scores}

__call__

__call__(features, return_tensors=None)

Use the features of the dataset in order to get the retrieval and lm-scores.

Parameters:

Name Type Description Default
features list[dict[str, Any]]

Each element should contain a 'query' and a 'response' field.

required
return_tensors str | None

supports right now only 'pt'

None

Returns:

Type Description
dict[str, Any]

dict[str, Any]: a dictionary of ~torch.Tensors with keys 'retrieval_scores' and 'lm_scores'

dict[str, Any]

Note that each ('query', 'response') pair generates one fine-tuning instance for LSR.

Source code in src/fed_rag/data_collators/huggingface/lsr.py
def __call__(
    self, features: list[dict[str, Any]], return_tensors: str | None = None
) -> dict[str, Any]:
    """Use the features of the dataset in order to get the retrieval and lm-scores.

    Args:
        features (list[dict[str, Any]]): Each element should contain a
            'query' and a 'response' field.
        return_tensors (str | None, optional): Only 'pt' is supported;
            defaults to ``self.default_return_tensors``.

    Returns:
        dict[str, Any]: a dictionary of ~torch.Tensors with keys 'retrieval_scores'
            and 'lm_scores'
        Note that each ('query', 'response') pair generates one fine-tuning instance for LSR.

    Raises:
        FedRAGError: If ``return_tensors`` is anything other than 'pt'.
    """
    return_tensors = (
        return_tensors if return_tensors else self.default_return_tensors
    )
    if return_tensors != "pt":
        raise FedRAGError(f"Framework '{return_tensors}' not recognized!")

    # use rag system to get scores
    batch_retriever_scores = []
    batch_lm_scores = []
    for example in features:
        query = example.get("query")
        response = example.get("response")

        # retriever scores - this should participate in gradient computation
        # NOTE(review): torch.tensor(...) creates a new leaf tensor, so
        # gradients reach this tensor but not the retriever's parameters
        # through it — confirm this is the intended LSR setup.
        source_nodes = self.rag_system.retrieve(query)
        retriever_scores = torch.tensor(
            [n.score for n in source_nodes], requires_grad=True
        )

        # lm supervised scores - we don't want these to participate in gradient computation
        lm_scores = []
        with torch.no_grad():
            for chunk in source_nodes:
                prompt = self.prompt_template.format(
                    query=query,
                    context=chunk.node.get_content()["text_content"],
                )
                target = self.target_template.format(response=response)
                lm_score = self.rag_system.generator.compute_target_sequence_proba(
                    prompt=prompt, target=target
                )
                lm_scores.append(lm_score)
            lm_scores = torch.stack(lm_scores, dim=0)

        # append to batch
        batch_retriever_scores.append(retriever_scores)
        batch_lm_scores.append(lm_scores)

    # create torch.Tensors; stacking requires every example to yield the
    # same number of retrieved chunks (same top-k).
    retrieval_scores = torch.stack(batch_retriever_scores, dim=0)
    lm_scores = torch.stack(batch_lm_scores, dim=0)

    return {"retrieval_scores": retrieval_scores, "lm_scores": lm_scores}