Source code for fl4health.feature_alignment.tab_features_info_encoder

from __future__ import annotations

import json

import pandas as pd
from flwr.common.typing import Scalar
from sklearn.feature_extraction.text import CountVectorizer

from fl4health.feature_alignment.feature_type_extraction import TabularFeatures
from fl4health.feature_alignment.tabular_feature import MetaData, TabularFeature
from fl4health.feature_alignment.tabular_type import TabularType



[docs]
class TabularFeaturesInfoEncoder:
    """
    Encode the information required to perform feature alignment on tabular datasets.

    The encoder holds two lists of ``TabularFeature`` objects, one for the input features and one for the targets.
    Both lists are kept sorted by feature name. The encoder can be serialized with ``to_json`` and restored with
    ``from_json``.
    """

    def __init__(self, tabular_features: list[TabularFeature], tabular_targets: list[TabularFeature]) -> None:
        """
        This class encodes all the information required to perform feature alignment on tabular datasets.

        **NOTE**: targets are not included in tabular_features

        Args:
            tabular_features (list[TabularFeature]): List of all tabular features.
            tabular_targets (list[TabularFeature]): List of all targets.
        """
        # Sorting by name gives a deterministic ordering regardless of the order the caller supplied.
        self.tabular_features = sorted(tabular_features, key=TabularFeature.get_feature_name)
        self.tabular_targets = sorted(tabular_targets, key=TabularFeature.get_feature_name)

    def get_tabular_features(self) -> list[TabularFeature]:
        """
        Return the stored (non-target) features.

        Returns:
            (list[TabularFeature]): The internal list of features; it is returned directly, not copied.
        """
        return self.tabular_features

    def get_tabular_targets(self) -> list[TabularFeature]:
        """
        Return the stored targets.

        Returns:
            (list[TabularFeature]): The internal list of targets; it is returned directly, not copied.
        """
        return self.tabular_targets

    def get_feature_columns(self) -> list[str]:
        """
        Return the names of all (non-target) features.

        Returns:
            (list[str]): Feature names in sorted order.
        """
        return sorted(feature.get_feature_name() for feature in self.tabular_features)

    def get_target_columns(self) -> list[str]:
        """
        Return the names of all targets.

        Returns:
            (list[str]): Target names in sorted order.
        """
        return sorted(target.get_feature_name() for target in self.tabular_targets)

    def features_by_type(self, tabular_type: TabularType) -> list[TabularFeature]:
        """
        Select the (non-target) features whose feature type equals ``tabular_type``.

        Args:
            tabular_type (TabularType): The type to filter on.

        Returns:
            (list[TabularFeature]): Matching features, sorted by feature name.
        """
        return sorted(
            [feature for feature in self.tabular_features if feature.get_feature_type() == tabular_type],
            key=TabularFeature.get_feature_name,
        )

    def type_to_features(self) -> dict[TabularType, list[TabularFeature]]:
        """
        Group the (non-target) features by their type.

        Returns:
            (dict[TabularType, list[TabularFeature]]): One entry per member of ``TabularType``, mapped to the
            result of ``features_by_type`` for that member.
        """
        return {tabular_type: self.features_by_type(tabular_type) for tabular_type in TabularType}

    def get_categories_list(self) -> list[MetaData]:
        """
        Collect the metadata of every feature of type ``TabularType.ORDINAL``.

        Returns:
            (list[MetaData]): Metadata of the ordinal features, in feature-name order.
        """
        return [cat_feature.get_metadata() for cat_feature in self.features_by_type(TabularType.ORDINAL)]

    def get_target_dimension(self) -> int:
        """
        Return the dimension of the target array *after* feature alignment is performed.

        Returns:
            (int): Sum of ``get_metadata_dimension()`` over all targets; 0 when there are no targets.
        """
        return sum(target.get_metadata_dimension() for target in self.tabular_targets)

    @staticmethod
    def _construct_tab_feature(
        df: pd.DataFrame,
        feature_name: str,
        feature_type: TabularType,
        fill_values: dict[str, Scalar] | None,
    ) -> TabularFeature:
        """
        Build the ``TabularFeature`` describing one column of ``df``.

        Args:
            df (pd.DataFrame): Dataframe containing the column ``feature_name``.
            feature_name (str): Name of the column to describe.
            feature_type (TabularType): Type assigned to the column.
            fill_values (dict[str, Scalar] | None): Optional per-feature fill values. When ``None`` or when it has
                no entry for ``feature_name``, ``TabularType.get_default_fill_value(feature_type)`` is used.

        Returns:
            (TabularFeature): The feature, carrying its sorted unique values as metadata for ORDINAL/BINARY types,
            the fitted ``CountVectorizer`` vocabulary for STRING, and no metadata otherwise.
        """
        if fill_values is None or feature_name not in fill_values:
            fill_value = TabularType.get_default_fill_value(feature_type)
        else:
            fill_value = fill_values[feature_name]

        if feature_type in {TabularType.ORDINAL, TabularType.BINARY}:
            # Extract categories information.
            # NOTE(review): ``unique()`` keeps NaN as a value, so missing entries would become a category and
            # sorting mixed types could raise. Confirm the column is clean before this point.
            feature_categories = sorted(df[feature_name].unique().tolist())
            return TabularFeature(feature_name, feature_type, fill_value, feature_categories)
        if feature_type == TabularType.STRING:
            # Extract vocabulary from a string column of df.
            count_vectorizer = CountVectorizer()
            count_vectorizer.fit(df[feature_name])
            vocabulary = count_vectorizer.vocabulary_
            return TabularFeature(feature_name, feature_type, fill_value, vocabulary)
        return TabularFeature(feature_name, feature_type, fill_value)

    @staticmethod
    def _normalize_target_columns(target_columns: str | list[str]) -> set[str]:
        """Return the target column names as a set, treating a bare string as a single name."""
        if isinstance(target_columns, str):
            return {target_columns}
        return set(target_columns)

    @staticmethod
    def encoder_from_dataframe(
        df: pd.DataFrame,
        id_column: str,
        target_columns: str | list[str],
        fill_values: dict[str, Scalar] | None = None,
    ) -> TabularFeaturesInfoEncoder:
        """
        Build an encoder by inferring feature types from a dataframe.

        Args:
            df (pd.DataFrame): Source dataframe.
            id_column (str): Name of the identifier column. It is removed from the list of features and passed to
                ``TabularFeatures`` as ``by``.
            target_columns (str | list[str]): Name, or list of names, of the target column(s).
            fill_values (dict[str, Scalar] | None, optional): Optional per-feature fill values. Features without an
                entry fall back to the default fill value of their type. Defaults to None.

        Raises:
            ValueError: If ``id_column`` is not one of the columns of ``df``.

        Returns:
            (TabularFeaturesInfoEncoder): Encoder whose targets are the columns named in ``target_columns`` and
            whose features are all remaining typed columns.
        """
        features_list = sorted(df.columns.values.tolist())
        features_list.remove(id_column)
        tab_features = TabularFeatures(
            data=df.reset_index(), features=features_list, by=id_column, targets=target_columns
        )
        features_to_types = tab_features.types

        # Compare against exact names. With a bare string, ``name in target_columns`` would be a substring test,
        # so a feature called "target" would wrongly match target_columns="target_value".
        target_names = TabularFeaturesInfoEncoder._normalize_target_columns(target_columns)

        tabular_targets = []
        tabular_features = []
        # Construct TabularFeature objects.
        for feature_name in features_to_types:
            feature_type = TabularType(features_to_types[feature_name].value)
            tabular_feature = TabularFeaturesInfoEncoder._construct_tab_feature(
                df, feature_name, feature_type, fill_values
            )
            if feature_name in target_names:
                tabular_targets.append(tabular_feature)
            else:
                tabular_features.append(tabular_feature)
        return TabularFeaturesInfoEncoder(tabular_features, tabular_targets)

    def to_json(self) -> str:
        """
        Serialize the encoder to a JSON string.

        Each list is JSON-encoded into a string first, and those two strings are then stored under the keys
        ``"tabular_features"`` and ``"tabular_targets"`` of the outer JSON object. ``from_json`` undoes both layers.

        Returns:
            (str): The JSON representation of this encoder.
        """
        return json.dumps(
            {
                "tabular_features": json.dumps([tab_feature.to_json() for tab_feature in self.tabular_features]),
                "tabular_targets": json.dumps([tab_target.to_json() for tab_target in self.tabular_targets]),
            }
        )

    @staticmethod
    def from_json(json_str: str) -> TabularFeaturesInfoEncoder:
        """
        Rebuild an encoder from the string produced by ``to_json``.

        Args:
            json_str (str): JSON object with keys ``"tabular_features"`` and ``"tabular_targets"``, each holding a
                JSON-encoded list of serialized ``TabularFeature`` objects.

        Returns:
            (TabularFeaturesInfoEncoder): The reconstructed encoder.
        """
        attributes = json.loads(json_str)
        return TabularFeaturesInfoEncoder(
            [TabularFeature.from_json(tab_str) for tab_str in json.loads(attributes["tabular_features"])],
            [TabularFeature.from_json(target_str) for target_str in json.loads(attributes["tabular_targets"])],
        )