# Source code for fl4health.feature_alignment.handle_types

"""Largely taken from https://github.com/VectorInstitute/cyclops."""

from typing import Any

import numpy as np
import pandas as pd
from pandas.api.types import (
    is_bool_dtype,
    is_integer_dtype,
    is_numeric_dtype,
)

from fl4health.feature_alignment.constants import (
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FeatureType,
)


def _to_string(series: pd.Series) -> tuple[pd.Series, dict[str, Any]]:
    """
    Treat a feature as a string feature.

    Args:
        series (pd.Series): Feature data.

    Returns:
        tuple[pd.Series, dict[str, Any]]: The feature data after ``to_dtype`` has been applied for the string type,
        and a metadata dictionary recording the feature type as ``FeatureType.STRING``.
    """
    target_type = FeatureType.STRING

    # Validation step only; the boolean result is not needed because failure raises.
    convertible_to_type(series, target_type, unique=None, raise_error=True)

    string_series = to_dtype(series, target_type)
    metadata = {FEATURE_TYPE_ATTR: target_type}
    return string_series, metadata


def _convertible_to_categorical_indicators(
    series: pd.Series,
    unique: np.ndarray | None = None,
    category_max: int = 20,
    raise_error_over_max: bool = False,
) -> bool:
    """
    Decide whether a feature is eligible for conversion to categorical indicators.

    This delegates to ``_convertible_to_categorical`` with a fixed minimum of two categories.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.
        category_max (int, optional): Largest number of categories allowed. Defaults to 20.
        raise_error_over_max (bool, optional): Whether to raise instead of returning False when the number of
            categories exceeds ``category_max``. Defaults to False.

    Returns:
        bool: Whether the feature can be converted.
    """
    minimum_categories = 2
    is_convertible = _convertible_to_categorical(
        series,
        category_min=minimum_categories,
        category_max=category_max,
        unique=unique,
        raise_error_over_max=raise_error_over_max,
    )
    return is_convertible


def _to_categorical_indicators(
    data: pd.DataFrame, col: str, unique: np.ndarray | None = None
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Convert a feature to binary categorical indicators.

    This performs the Pandas equivalent of one-hot encoding: the column ``col`` is replaced by one indicator column
    per value, named ``"<col>_<value>"`` by ``pd.get_dummies``.

    Args:
        data (pd.DataFrame): Features data.
        col (str): Feature column being converted.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. Accepted for
            signature parity with the other converters; the indicator columns are derived from the column's own
            values, so this argument is not consulted. Defaults to None.

    Raises:
        ValueError: If any generated indicator column name already exists in ``data``.

    Returns:
        tuple[pd.DataFrame, dict[str, Any]]: Tuple (pandas.DataFrame, dict) with the updated features data
        (indicator columns appended, ``col`` dropped) and per-indicator metadata respectively.
    """
    series = data[col]
    dummies = pd.get_dummies(series, prefix=str(series.name))

    # Fail fast: check for name clashes before spending time re-typing every indicator column.
    clashing = set(dummies.columns).intersection(data.columns)
    if clashing:
        # Sorted so that the error message is deterministic regardless of set iteration order.
        raise ValueError(f"Cannot duplicate columns {', '.join(sorted(clashing))}.")

    meta: dict[str, Any] = {}
    for dummy_col in dummies.columns:
        dummies[dummy_col] = to_dtype(dummies[dummy_col], FeatureType.CATEGORICAL_INDICATOR)
        meta[dummy_col] = {
            FEATURE_TYPE_ATTR: FeatureType.CATEGORICAL_INDICATOR,
            # Remember which original column this indicator was generated from.
            FEATURE_INDICATOR_ATTR: col,
        }

    data = pd.concat([data, dummies], axis=1)
    data = data.drop([col], axis=1)

    return data, meta


def _convertible_to_ordinal(
    series: pd.Series,
    unique: np.ndarray | None = None,
    category_max: int = 20,
    raise_error_over_max: bool = False,
) -> bool:
    """
    Decide whether a feature is eligible for conversion to the ordinal type.

    This delegates to ``_convertible_to_categorical`` with a fixed minimum of two categories.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.
        category_max (int, optional): Number of categories above which the feature is not considered ordinal.
            Defaults to 20.
        raise_error_over_max (bool, optional): Whether to raise instead of returning False when there are more
            categories than ``category_max``. Defaults to False.

    Returns:
        bool: Whether the feature can be converted.
    """
    minimum_categories = 2
    is_convertible = _convertible_to_categorical(
        series,
        category_min=minimum_categories,
        category_max=category_max,
        unique=unique,
        raise_error_over_max=raise_error_over_max,
    )
    return is_convertible


def _to_ordinal(series: pd.Series, unique: np.ndarray | None = None) -> tuple[pd.Series, dict[str, Any]]:
    """
    Convert a feature to the ordinal type.

    The values are first replaced by integer codes via ``_numeric_categorical_mapping`` and the result is then
    given the dtype associated with ``FeatureType.ORDINAL``.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.

    Returns:
        tuple[pd.Series, dict[str, Any]]: The converted feature data and its metadata (code-to-value mapping plus
        the feature type).
    """
    ordinal = FeatureType.ORDINAL
    coded_series, metadata = _numeric_categorical_mapping(series, unique=unique)
    metadata[FEATURE_TYPE_ATTR] = ordinal
    ordinal_series = to_dtype(coded_series, ordinal)
    return ordinal_series, metadata


def _numeric_categorical_mapping(
    series: pd.Series, unique: np.ndarray | None = None
) -> tuple[pd.Series, dict[str, Any]]:
    """
    Map the values of a series to integer category codes.

    The unique values are sorted and each one is assigned its position in the sorted order as its code.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. The array is not
            modified by this function. Defaults to None.

    Returns:
        tuple[pd.Series, dict[str, Any]]: Tuple (pandas.Series, dict) with the coded feature data and metadata
        respectively. The metadata holds the inverse mapping (code -> original value) under
        ``FEATURE_MAPPING_ATTR``.
    """
    unique = get_unique(series, unique=unique)
    if unique.dtype.name == "object":
        # Object arrays may hold mutually incomparable values; compare them by their string form instead.
        # NOTE(review): the mapping keys are then strings, so non-string values in ``series`` will not match a
        # key and are mapped to NaN by ``Series.map`` - confirm this is intended.
        unique = unique.astype(str)

    # np.sort returns a sorted copy. An in-place ``unique.sort()`` would silently reorder an array supplied by the
    # caller, because ``get_unique`` hands a provided ``unique`` back unchanged.
    ordered = np.sort(unique)

    map_dict: dict[Any, int] = {value: code for code, value in enumerate(ordered)}

    series = series.map(map_dict)

    inv_map = {code: value for value, code in map_dict.items()}
    meta = {FEATURE_MAPPING_ATTR: inv_map}

    return series, meta


def _convertible_to_binary(series: pd.Series, unique: np.ndarray | None = None) -> bool:
    """
    Check whether a feature can be converted to type binary.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. Defaults to None.

    Returns:
        bool: Whether the feature can be converted.
    """
    if is_bool_dtype(series):
        return True

    return _convertible_to_categorical(
        series,
        category_min=2,
        category_max=2,
        unique=unique,
    )


def _to_binary(series: pd.Series, unique: np.ndarray | None = None) -> tuple[pd.Series, dict[str, Any]]:
    """
    Convert a feature to the binary type.

    Boolean series keep their values and get an identity mapping in the metadata. Every other series is first
    replaced by integer codes via ``_numeric_categorical_mapping``.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.

    Returns:
        tuple[pd.Series, dict[str, Any]]: The converted feature data and its metadata (value mapping plus the
        feature type).
    """
    binary = FeatureType.BINARY
    metadata: dict[str, Any]

    if is_bool_dtype(series):
        converted = series
        metadata = {
            FEATURE_TYPE_ATTR: binary,
            FEATURE_MAPPING_ATTR: {False: False, True: True},
        }
    else:
        converted, metadata = _numeric_categorical_mapping(series, unique=unique)
        metadata[FEATURE_TYPE_ATTR] = binary

    return to_dtype(converted, binary), metadata


def _convertible_to_numeric(series: pd.Series, raise_error: bool = False) -> bool:
    """
    Check whether a feature can be converted to type numeric.

    Args:
        series (pd.Series): Feature data.
        raise_error (bool, optional): Whether to raise an error if the type cannot be converted. Defaults to False.

    Returns:
        bool: Whether the feature can be converted.
    """
    if raise_error:
        pd.to_numeric(series)
        return True

    try:
        pd.to_numeric(series)
        can_convert = True
    except (ValueError, TypeError):
        can_convert = False

    return can_convert


def _to_numeric(series: pd.Series, unique: np.ndarray | None = None) -> tuple[pd.Series, dict[str, Any]]:
    """
    Convert a feature to the numeric type.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature; forwarded to the
            convertibility check. Defaults to None.

    Returns:
        tuple[pd.Series, dict[str, Any]]: The feature data after ``pd.to_numeric`` and ``to_dtype``, and a metadata
        dictionary recording the feature type as ``FeatureType.NUMERIC``.
    """
    numeric = FeatureType.NUMERIC

    # Raises when the series cannot be converted; the boolean result itself is not needed.
    convertible_to_type(series, numeric, unique=unique, raise_error=True)

    numeric_series = to_dtype(pd.to_numeric(series), numeric)
    return numeric_series, {FEATURE_TYPE_ATTR: numeric}


def _convertible_to_categorical(
    series: pd.Series,
    category_min: int | None = None,
    category_max: int | None = None,
    unique: np.ndarray | None = None,
    raise_error_over_max: bool = False,
    raise_error_under_min: bool = False,
) -> bool:
    """
    Check whether a feature can be converted to some categorical type.

    Args:
        series (pd.Series): Feature data.
        category_min (int | None, optional):  The minimum number of categories allowed. Defaults to None.
        category_max (int | None, optional): The maximum number of categories allowed. Defaults to None.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. Defaults to None.
        raise_error_over_max (bool, optional):  Whether to raise an error if there are more categories than max.
            Defaults to False.
        raise_error_under_min (bool, optional): Whether to raise an error if there are less categories than min.
            Defaults to False.

    Raises:
        ValueError: Raise an error if there are more categories than max and ``raise_error_over_max`` is True
        ValueError: Raise an error if there are less categories than min and ``raise_error_under_min`` is True

    Returns:
        bool: Whether the feature can be converted.
    """
    # If numeric, only allow conversion if an integer type
    if is_numeric_dtype(series) and not is_integer_dtype(series):
        return False

    unique = get_unique(series, unique=unique)
    nonnull_unique = unique[~pd.isnull(unique)]
    nunique = len(nonnull_unique)

    satisfies_minimum_condition = True if category_min is None else nunique >= category_min

    satisfies_maximum_condition = True if category_max is None else nunique <= category_max

    # Convertible
    if satisfies_minimum_condition and satisfies_maximum_condition:
        return True

    # Not convertible
    if (not satisfies_maximum_condition) and raise_error_over_max:
        raise ValueError(
            f"Should have at most {category_max} categories, but has {nunique}.",
        )

    if (not satisfies_minimum_condition) and raise_error_under_min:
        raise ValueError(
            f"Should have at least {category_min} categories, but has {nunique}.",
        )

    return False



# [docs] (Sphinx viewcode link marker left over from the rendered page)
def convertible_to_type(
    series: pd.Series, type: FeatureType, unique: np.ndarray | None = None, raise_error: bool = False
) -> bool:
    """
    Check whether a feature can be converted to a given feature type.

    Args:
        series (pd.Series): Feature data.
        type (FeatureType): Feature type to check for conversion.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. Defaults to None.
        raise_error (bool, optional): Whether to raise an error if the feature cannot be converted.
            Defaults to False.

    Raises:
        ValueError: ``type`` is not a valid feature type (raised by ``valid_feature_type``).
        ValueError: ``type`` is a valid feature type but is not handled by this function.
        ValueError: The series cannot be converted to ``type`` and ``raise_error`` is True.

    Returns:
        bool: Whether the feature can be converted.
    """
    if type == FeatureType.NUMERIC:
        is_convertible = _convertible_to_numeric(series)
    elif type == FeatureType.STRING:
        # Every series is accepted as a string feature.
        is_convertible = True
    elif type == FeatureType.BINARY:
        is_convertible = _convertible_to_binary(series, unique=unique)
    elif type == FeatureType.ORDINAL:
        is_convertible = _convertible_to_ordinal(series, unique=unique)
    elif type == FeatureType.CATEGORICAL_INDICATOR:
        is_convertible = _convertible_to_categorical_indicators(series, unique=unique)
    else:
        # Raises for an invalid type; a valid type reaching this point is simply not handled here.
        valid_feature_type(type, raise_error=True)
        raise ValueError("Supported type has no corresponding datatype.")

    if raise_error and not is_convertible:
        raise ValueError(f"Cannot convert series {series.name} to type {type}.")

    return is_convertible




# [docs] (Sphinx viewcode link marker left over from the rendered page)
def get_unique(values: np.ndarray | pd.Series, unique: np.ndarray | None = None) -> np.ndarray:
    """
    Get the unique values of pandas series.

    The utility of this function comes from checking whether the unique values have already been calculated. This
    function assumes that if the unique values are passed, they are correct.

    Args:
        values (np.ndarray | pd.Series): Values for which to get the unique values.
        unique (np.ndarray | None, optional): Unique values which can be optionally specified. Defaults to None.

    Returns:
        np.ndarray: The unique values.
    """
    if unique is None:
        return np.array(values.unique())  # type: ignore

    return unique




# [docs] (Sphinx viewcode link marker left over from the rendered page)
def valid_feature_type(type: FeatureType, raise_error: bool = True) -> bool:
    """
    Check whether a feature type is valid, i.e. contained in ``FEATURE_TYPES``.

    Args:
        type (FeatureType): Feature type name.
        raise_error (bool, optional): Whether to raise an error if the type is invalid. Defaults to True.

    Raises:
        ValueError: Raised when the type is invalid and ``raise_error`` is True.

    Returns:
        bool: Whether the type is valid.
    """
    if type in FEATURE_TYPES:
        return True

    if not raise_error:
        return False

    all_feature_types = ", ".join(feature_type.value for feature_type in FEATURE_TYPES)
    # An invalid argument is not necessarily an enum member (e.g. a raw string); fall back to the object itself so
    # the documented ValueError is raised rather than an AttributeError on ``.value``.
    offending = getattr(type, "value", type)
    raise ValueError(f"Feature type '{offending}' not in {all_feature_types}.")



def _type_to_dtype(type: FeatureType) -> str | None:
    """
    Look up the Pandas datatype used for a feature type.

    Args:
        type (FeatureType): Feature type name.

    Raises:
        ValueError: ``type`` is not a valid feature type (raised by ``valid_feature_type``).
        ValueError: ``type`` is a valid feature type but has no corresponding datatype here.

    Returns:
        str | None: The feature's Pandas datatype, or None if no data type conversion is desired.
    """
    # String and numeric features are left as they are: the user chooses the specific length/type or precision.
    untouched_types = (FeatureType.STRING, FeatureType.NUMERIC)
    if type in untouched_types:
        return None

    category_backed_types = (FeatureType.BINARY, FeatureType.CATEGORICAL_INDICATOR, FeatureType.ORDINAL)
    if type in category_backed_types:
        return "category"

    # The validity check raises for an invalid type; a valid one reaching this point is not supported here.
    if valid_feature_type(type, raise_error=True):
        raise ValueError("Supported type has no corresponding datatype.")

    return None



# [docs] (Sphinx viewcode link marker left over from the rendered page)
def to_dtype(series: pd.Series, type: FeatureType) -> pd.Series:
    """
    Give the series the datatype that corresponds to the feature type.

    Args:
        series (pd.Series): Feature data.
        type (FeatureType): Feature type name.

    Returns:
        pd.Series: The feature with the corresponding datatype. The input series itself is returned when no
        conversion is desired for the type or when it already has the target datatype.
    """
    target_dtype = _type_to_dtype(type)

    # Nothing to do when the type requests no conversion or the series already matches.
    if target_dtype is None or series.dtype == target_dtype:
        return series

    return series.astype(target_dtype)  # type: ignore



def _infer_type(series: pd.Series, unique: np.ndarray | None = None) -> FeatureType:
    """
    Infer the intended feature type of a series.

    Candidate types are tried in a fixed order (binary, ordinal, numeric, string) and the first one the series is
    convertible to is returned. The series itself is not converted.

    Args:
        series (pd.Series): Feature data.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.

    Raises:
        ValueError: Could not infer type of series.

    Returns:
        FeatureType: The inferred feature type.
    """
    # Compute the unique values once and share them across all convertibility checks.
    unique = get_unique(series, unique=unique)

    candidates_in_priority_order = (
        FeatureType.BINARY,
        FeatureType.ORDINAL,
        FeatureType.NUMERIC,
        FeatureType.STRING,
    )
    for candidate in candidates_in_priority_order:
        if convertible_to_type(series, candidate, unique=unique):
            return candidate

    raise ValueError(f"Could not infer type of series '{series.name}'.")


def _to_type(
    data: pd.DataFrame, col: str, new_type: FeatureType, unique: np.ndarray | None = None
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Convert one feature column to a given type.

    For categorical indicators the work is delegated to ``_to_categorical_indicators``. For every other type the
    converted column is assigned back into ``data`` under ``col``, so the passed DataFrame is modified.

    Args:
        data (pd.DataFrame): Features data.
        col (str): Column name for the feature being converted.
        new_type (FeatureType): Feature type to which to convert.
        unique (np.ndarray | None, optional): Pre-computed unique values of the feature. Defaults to None.

    Raises:
        ValueError: ``data`` is None.
        ValueError: ``new_type`` is invalid, or valid but not convertible to here.

    Returns:
        tuple[pd.DataFrame, dict[str, Any]]: Tuple (pandas.DataFrame, dict) with the updated features data and the
        metadata keyed by column name respectively.
    """
    if data is None:
        raise ValueError(
            "The features data must be passed to keyword argument 'data'.",
        )

    # Indicators replace one column with several, so that converter works on the whole frame.
    if new_type == FeatureType.CATEGORICAL_INDICATOR:
        return _to_categorical_indicators(data, col, unique=unique)

    if new_type == FeatureType.STRING:
        converted, feature_meta = _to_string(data[col])
    elif new_type == FeatureType.ORDINAL:
        converted, feature_meta = _to_ordinal(data[col], unique=unique)
    elif new_type == FeatureType.BINARY:
        converted, feature_meta = _to_binary(data[col], unique=unique)
    elif new_type == FeatureType.NUMERIC:
        converted, feature_meta = _to_numeric(data[col], unique=unique)
    else:
        # Raises for an invalid type; a valid type reaching this point is not supported.
        valid_feature_type(new_type, raise_error=True)
        raise ValueError(f"Cannot convert to type {new_type}.")

    data[col] = converted
    return data, {str(converted.name): feature_meta}



# [docs] (Sphinx viewcode link marker left over from the rendered page)
def infer_types(data: pd.DataFrame, features: list[str]) -> dict[str, FeatureType]:
    """
    Infer the intended feature type of each requested column.

    Args:
        data (pd.DataFrame): Feature data.
        features (list[str]): Names of the columns to consider.

    Returns:
        dict[str, FeatureType]: Map from each column name in ``features`` to its inferred feature type.
    """
    return {col: _infer_type(data[col]) for col in features}




# [docs] (Sphinx viewcode link marker left over from the rendered page)
def to_types(data: pd.DataFrame, new_types: dict[str, FeatureType]) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Convert several features to their given types.

    Columns are converted one after another in the iteration order of ``new_types``, each conversion operating on
    the frame produced by the previous one.

    Args:
        data (pd.DataFrame): Features data.
        new_types (dict[str, FeatureType]): Map from the feature column name to its new type.

    Returns:
        tuple[pd.DataFrame, dict[str, Any]]: Tuple (pandas.DataFrame, dict) with the updated features data and the
        merged metadata of all conversions respectively.
    """
    combined_meta: dict[str, Any] = {}
    for col, new_type in new_types.items():
        data, feature_meta = _to_type(data, col, new_type)
        combined_meta |= feature_meta

    return data, combined_meta