from __future__ import annotations
import json
import pandas as pd
from cyclops.data.df.feature import TabularFeatures
from flwr.common.typing import Scalar
from sklearn.feature_extraction.text import CountVectorizer
from fl4health.feature_alignment.tabular_feature import MetaData, TabularFeature
from fl4health.feature_alignment.tabular_type import TabularType
[docs]
class TabularFeaturesInfoEncoder:
"""
This class encodes all the information required to perform feature
alignment on tabular datasets.
Args:
tabular_features (list[TabularFeature]): List of all tabular features.
tabular_targets (list[TabularFeature]): List of all targets.
(Note: targets are not included in tabular_features)
"""
def __init__(self, tabular_features: list[TabularFeature], tabular_targets: list[TabularFeature]) -> None:
self.tabular_features = sorted(tabular_features, key=TabularFeature.get_feature_name)
self.tabular_targets = sorted(tabular_targets, key=TabularFeature.get_feature_name)
[docs]
def get_tabular_features(self) -> list[TabularFeature]:
return self.tabular_features
[docs]
def get_tabular_targets(self) -> list[TabularFeature]:
return self.tabular_targets
[docs]
def get_feature_columns(self) -> list[str]:
return sorted([feature.get_feature_name() for feature in self.tabular_features])
[docs]
def get_target_columns(self) -> list[str]:
return sorted([target.get_feature_name() for target in self.tabular_targets])
[docs]
def features_by_type(self, tabular_type: TabularType) -> list[TabularFeature]:
return sorted(
[feature for feature in self.tabular_features if feature.get_feature_type() == tabular_type],
key=TabularFeature.get_feature_name,
)
[docs]
def type_to_features(self) -> dict[TabularType, list[TabularFeature]]:
return {tabular_type: self.features_by_type(tabular_type) for tabular_type in TabularType}
[docs]
def get_categories_list(self) -> list[MetaData]:
return [cat_feature.get_metadata() for cat_feature in self.features_by_type(TabularType.ORDINAL)]
[docs]
def get_target_dimension(self) -> int:
# Return the dimension of the target array *after* feature alignment is performed.
dimension = 0
for target in self.tabular_targets:
dimension += target.get_metadata_dimension()
return dimension
@staticmethod
def _construct_tab_feature(
df: pd.DataFrame,
feature_name: str,
feature_type: TabularType,
fill_values: dict[str, Scalar] | None,
) -> TabularFeature:
if fill_values is None or feature_name not in fill_values:
fill_value = TabularType.get_default_fill_value(feature_type)
else:
fill_value = fill_values[feature_name]
if feature_type == TabularType.ORDINAL or feature_type == TabularType.BINARY:
# Extract categories information.
feature_categories = sorted(df[feature_name].unique().tolist())
return TabularFeature(feature_name, feature_type, fill_value, feature_categories)
elif feature_type == TabularType.STRING:
# Extract vocabulary from a string column of df.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df[feature_name])
vocabulary = count_vectorizer.vocabulary_
return TabularFeature(feature_name, feature_type, fill_value, vocabulary)
else:
return TabularFeature(feature_name, feature_type, fill_value)
[docs]
@staticmethod
def encoder_from_dataframe(
df: pd.DataFrame,
id_column: str,
target_columns: str | list[str],
fill_values: dict[str, Scalar] | None = None,
) -> TabularFeaturesInfoEncoder:
features_list = sorted(df.columns.values.tolist())
features_list.remove(id_column)
# Leverage cyclops to perform type inference
tab_features = TabularFeatures(
data=df.reset_index(), features=features_list, by=id_column, targets=target_columns
)
features_to_types = tab_features.types
tabular_targets = []
tabular_features = []
# Construct TabularFeature objects.
for feature_name in features_to_types:
feature_type = TabularType(features_to_types[feature_name])
tabular_feature = TabularFeaturesInfoEncoder._construct_tab_feature(
df, feature_name, feature_type, fill_values
)
if feature_name == target_columns or feature_name in target_columns:
tabular_targets.append(tabular_feature)
else:
tabular_features.append(tabular_feature)
return TabularFeaturesInfoEncoder(tabular_features, tabular_targets)
[docs]
def to_json(self) -> str:
return json.dumps(
{
"tabular_features": json.dumps([tab_feature.to_json() for tab_feature in self.tabular_features]),
"tabular_targets": json.dumps([tab_target.to_json() for tab_target in self.tabular_targets]),
}
)
[docs]
@staticmethod
def from_json(json_str: str) -> TabularFeaturesInfoEncoder:
attributes = json.loads(json_str)
return TabularFeaturesInfoEncoder(
[TabularFeature.from_json(tab_str) for tab_str in json.loads(attributes["tabular_features"])],
[TabularFeature.from_json(target_str) for target_str in json.loads(attributes["tabular_targets"])],
)