# Source code for fl4health.datasets.skin_cancer.preprocess_skin

"""
The following code is adapted from the preprocess_skin.py script
from the medical_federated GitHub repository by Seongjun Yang et al.

Paper: https://arxiv.org/abs/2207.03075
Code: https://github.com/wns823/medical_federated.git
- medical_federated/skin_cancer_federated/preprocess_skin.py
"""

import json
import os
from collections.abc import Callable
from typing import Any

import pandas as pd


def save_to_json(data: dict[str, Any], path: str) -> None:
    """
    Serializes a dictionary to disk as tab-indented JSON.

    The target file is opened in write mode with UTF-8 encoding, so any existing content at ``path`` is
    replaced.

    Args:
        data: The dictionary to serialize.
        path: Location of the JSON file to write.
    """
    with open(path, mode="w", encoding="utf-8") as json_file:
        json.dump(data, fp=json_file, indent="\t")
def _one_hot_label(
    label: str,
    columns: list[str],
    columns_name: str,
    client_name: str,
    row_number: int,
) -> list[int]:
    """
    Builds a one-hot list over ``columns`` with a 1 at the position of ``label``.

    Args:
        label: The label to encode.
        columns: The ordered label names defining the encoding.
        columns_name: Human-readable name of ``columns``, used only in the error message.
        client_name: Name of the client being processed, used only in the error message.
        row_number: Zero-based position of the row being processed, used only in the error message.

    Returns:
        A list of ``len(columns)`` integers that is all zeros except for a single 1.

    Raises:
        ValueError: If ``label`` does not occur in ``columns``.
    """
    try:
        hot_position = columns.index(label)
    except ValueError:
        # Re-raise with enough context to locate the offending row; the bare ``list.index`` message
        # ("'X' is not in list") does not say which client, row or column list was involved.
        raise ValueError(
            f"Label {label!r} (row {row_number} of client {client_name!r}) is not in {columns_name}: {columns}"
        ) from None
    encoded = [0] * len(columns)
    encoded[hot_position] = 1
    return encoded


def process_client_data(
    dataframe: pd.DataFrame,
    client_name: str,
    data_path: str,
    image_path_func: Callable[[pd.Series], str],
    label_map_func: Callable[[pd.Series], str],
    original_columns: list[str],
    official_columns: list[str],
) -> None:
    """
    Processes and saves the client-specific dataset.

    Every row of ``dataframe`` becomes one entry holding the image path and two one-hot label lists: one
    over ``original_columns`` and one over ``official_columns``. The result is written to
    ``<data_path>/<client_name>.json``. Nothing is written if any row fails to encode.

    Args:
        dataframe: The dataframe containing the client data.
        client_name: The name of the client.
        data_path: The base path to the dataset.
        image_path_func: A function that constructs the image path from a dataframe row.
        label_map_func: A function that maps the original label to the new label.
        original_columns: The list of original columns for the dataset.
        official_columns: The list of official columns for the dataset.

    Raises:
        ValueError: If a mapped label is missing from ``original_columns`` or ``official_columns``.
    """
    records: list[dict[str, Any]] = []
    for row_number, (_, row) in enumerate(dataframe.iterrows()):
        img_path = image_path_func(row)
        label = label_map_func(row)
        # The original-column encoding is checked first, then the official one.
        origin_labels = _one_hot_label(label, original_columns, "original_columns", client_name, row_number)
        extended_labels = _one_hot_label(label, official_columns, "official_columns", client_name, row_number)
        records.append(
            {
                "img_path": img_path,
                "origin_labels": origin_labels,
                "extended_labels": extended_labels,
            }
        )

    preprocessed_data: dict[str, Any] = {
        "columns": official_columns,
        "original_columns": original_columns,
        "data": records,
    }
    save_to_json(preprocessed_data, os.path.join(data_path, f"{client_name}.json"))
def preprocess_isic_2019(data_path: str, official_columns: list[str]) -> None:
    """
    Builds the Barcelona client file from the ISIC 2019 dataset.

    The ground-truth rows whose image belongs to a metadata row with "BCN" in its ``lesion_id`` are
    written to ``ISIC_2019/ISIC_2019_core.csv``. That file is then read back and turned into
    ``ISIC_2019/ISIC_19_Barcelona.json``. The label values are copied from the CSV as-is and used for both
    the origin and the extended label lists.

    Args:
        data_path (str): The base path to the dataset.
        official_columns (list[str]): The list of official columns for the dataset.
    """
    dataset_dir = os.path.join(data_path, "ISIC_2019")
    ground_truth = pd.read_csv(os.path.join(dataset_dir, "ISIC_2019_Training_GroundTruth.csv"))
    metadata = pd.read_csv(os.path.join(dataset_dir, "ISIC_2019_Training_Metadata.csv"))

    # Keep only the lesions whose identifier contains "BCN", then the ground-truth rows for their images.
    bcn_lesion_ids = [lesion_id for lesion_id in metadata["lesion_id"].dropna() if "BCN" in lesion_id]
    bcn_metadata = metadata[metadata["lesion_id"].isin(bcn_lesion_ids)]
    bcn_ground_truth = ground_truth[ground_truth["image"].isin(bcn_metadata["image"])]

    core_csv_path = os.path.join(dataset_dir, "ISIC_2019_core.csv")
    bcn_ground_truth.to_csv(core_csv_path, mode="w")

    image_dir = os.path.join(data_path, "ISIC_2019", "ISIC_2019_Training_Input")
    # Reading the file back yields consecutive 0..n-1 row labels, which the ``.loc`` lookups below rely on.
    selected = pd.read_csv(core_csv_path)[["image"] + official_columns + ["UNK"]]

    entries = []
    for position in range(len(selected)):
        # Drop the trailing "UNK" value; what remains is the image name followed by the label values.
        image_name, *labels = selected.loc[position].values[:-1]
        entries.append(
            {
                "img_path": os.path.join(image_dir, image_name + ".jpg"),
                "origin_labels": labels,
                "extended_labels": list(labels),
            }
        )

    preprocessed_data: dict[str, Any] = {
        "columns": official_columns,
        "original_columns": official_columns,
        "data": entries,
    }
    save_to_json(preprocessed_data, os.path.join(dataset_dir, "ISIC_19_Barcelona.json"))
def ham_image_path_func(row: pd.Series) -> str:
    """
    Builds the path of a HAM10000 image from its metadata row.

    Args:
        row (pd.Series): A row from the dataframe; its ``image_id`` entry is the file name without extension.

    Returns:
        str: The ``.jpg`` path under ``fl4health/datasets/skin_cancer/HAM10000``.
    """
    file_name = row["image_id"] + ".jpg"
    dataset_dir = os.path.join("fl4health", "datasets", "skin_cancer", "HAM10000")
    return os.path.join(dataset_dir, file_name)
def ham_label_map_func(row: pd.Series) -> str:
    """
    Translates the HAM10000 ``dx`` code of a row into the shared label name.

    Args:
        row (pd.Series): A row from the dataframe.

    Returns:
        str: The mapped label.

    Raises:
        KeyError: If the row's ``dx`` value is not one of the known HAM10000 codes.
    """
    diagnosis_code = row["dx"]
    label_by_code = dict(
        akiec="AK",
        bcc="BCC",
        bkl="BKL",
        df="DF",
        mel="MEL",
        nv="NV",
        vasc="VASC",
    )
    return label_by_code[diagnosis_code]
def preprocess_ham10000(data_path: str, official_columns: list[str]) -> None:
    """
    Splits HAM10000 into a "rosendahl" client and a client with every other row, then processes both.

    The metadata is read from ``HAM10000/HAM10000_metadata``. Each split is first written to its own CSV
    (``HAM_rosendahl.csv`` / ``HAM_vienna.csv``). Both CSVs are then read back and handed to
    ``process_client_data``.

    Args:
        data_path (str): The base path to the dataset.
        official_columns (list[str]): The list of official columns for the dataset.
    """
    ham_dir = os.path.join(data_path, "HAM10000")
    metadata = pd.read_csv(os.path.join(ham_dir, "HAM10000_metadata"))

    # Rows are assigned by their "dataset" value: "rosendahl" versus anything else.
    split_masks = {
        "HAM_rosendahl": metadata["dataset"] == "rosendahl",
        "HAM_vienna": metadata["dataset"] != "rosendahl",
    }

    # Write both split files before any processing starts.
    for client_name, mask in split_masks.items():
        metadata[mask].to_csv(os.path.join(ham_dir, f"{client_name}.csv"), mode="w")

    ham_columns = ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC"]
    for client_name in split_masks:
        client_frame = pd.read_csv(os.path.join(ham_dir, f"{client_name}.csv"))
        process_client_data(
            client_frame,
            client_name,
            ham_dir,
            ham_image_path_func,
            ham_label_map_func,
            ham_columns,
            official_columns,
        )
def pad_image_path_func(row: pd.Series) -> str:
    """
    Builds the path of a PAD-UFES-20 image from its metadata row.

    Args:
        row (pd.Series): A row from the dataframe; its ``img_id`` entry is used as the file name unchanged.

    Returns:
        str: The path under ``fl4health/datasets/skin_cancer/PAD-UFES-20``.
    """
    directory_parts = ("fl4health", "datasets", "skin_cancer", "PAD-UFES-20")
    return os.path.join(*directory_parts, row["img_id"])
def pad_label_map_func(row: pd.Series) -> str:
    """
    Translates the PAD-UFES-20 ``diagnostic`` code of a row into the shared label name.

    Args:
        row (pd.Series): A row from the dataframe.

    Returns:
        str: The mapped label.

    Raises:
        KeyError: If the row's ``diagnostic`` value is not one of the known PAD-UFES-20 codes.
    """
    source_codes = ("ACK", "BCC", "MEL", "NEV", "SCC", "SEK")
    shared_labels = ("AK", "BCC", "MEL", "NV", "SCC", "BKL")
    label_by_code = dict(zip(source_codes, shared_labels))
    return label_by_code[row["diagnostic"]]
def preprocess_pad_ufes_20(data_path: str, official_columns: list[str]) -> None:
    """
    Processes the PAD-UFES-20 dataset as a single client named "PAD_UFES_20".

    The metadata is read from ``PAD-UFES-20/metadata.csv`` and handed to ``process_client_data``.

    Args:
        data_path (str): The base path to the dataset.
        official_columns (list[str]): The list of official columns for the dataset.
    """
    dataset_dir = os.path.join(data_path, "PAD-UFES-20")
    metadata = pd.read_csv(os.path.join(dataset_dir, "metadata.csv"))

    process_client_data(
        dataframe=metadata,
        client_name="PAD_UFES_20",
        data_path=dataset_dir,
        image_path_func=pad_image_path_func,
        label_map_func=pad_label_map_func,
        original_columns=["MEL", "NV", "BCC", "AK", "BKL", "SCC"],
        official_columns=official_columns,
    )
def derm7pt_image_path_func(row: pd.Series) -> str:
    """
    Builds the path of a Derm7pt image from its metadata row.

    Args:
        row (pd.Series): A row from the dataframe; its ``derm`` entry is appended to the image directory.

    Returns:
        str: The path under ``fl4health/datasets/skin_cancer/Derm7pt/images``.
    """
    images_dir = os.path.join("fl4health", "datasets", "skin_cancer", "Derm7pt", "images")
    relative_image = row["derm"]
    return os.path.join(images_dir, relative_image)
def derm7pt_label_map_func(row: pd.Series) -> str:
    """
    Translates the Derm7pt ``diagnosis`` text of a row into the shared label name.

    Note that three diagnoses map to "MISC".

    Args:
        row (pd.Series): A row from the dataframe.

    Returns:
        str: The mapped label.

    Raises:
        KeyError: If the row's ``diagnosis`` value is not one of the known Derm7pt diagnoses.
    """
    diagnoses_by_label = {
        "BCC": ("basal cell carcinoma",),
        "NV": (
            "blue nevus",
            "clark nevus",
            "combined nevus",
            "congenital nevus",
            "dermal nevus",
            "recurrent nevus",
            "reed or spitz nevus",
        ),
        # The source annotated this entry with "MISC" - presumably its grouping in Derm7pt; confirm.
        "DF": ("dermatofibroma",),
        "MISC": ("lentigo", "melanosis", "miscellaneous"),
        "MEL": (
            "melanoma",
            "melanoma (0.76 to 1.5 mm)",
            "melanoma (in situ)",
            "melanoma (less than 0.76 mm)",
            "melanoma (more than 1.5 mm)",
            "melanoma metastasis",
        ),
        "BKL": ("seborrheic keratosis",),
        # The source annotated this entry with "MISC" - presumably its grouping in Derm7pt; confirm.
        "VASC": ("vascular lesion",),
    }
    # Invert the grouping into a flat diagnosis -> label lookup.
    label_by_diagnosis = {
        diagnosis: label for label, diagnoses in diagnoses_by_label.items() for diagnosis in diagnoses
    }
    return label_by_diagnosis[row["diagnosis"]]
def preprocess_derm7pt(data_path: str, official_columns: list[str]) -> None:
    """
    Preprocesses the Derm7pt dataset.

    The metadata is read from ``Derm7pt/meta/meta_core.csv``. Rows whose mapped label is not one of the
    Derm7pt label columns are removed before the remaining rows are handed to ``process_client_data``
    under the client name "Derm7pt".

    Args:
        data_path (str): The base path to the dataset.
        official_columns (list[str]): The list of official columns for the dataset.

    Raises:
        KeyError: If a row's diagnosis is unknown to ``derm7pt_label_map_func``.
    """
    derm7pt_path = os.path.join(data_path, "Derm7pt")
    derm7pt_df = pd.read_csv(os.path.join(derm7pt_path, "meta", "meta_core.csv"))
    derm7pt_columns = ["MEL", "NV", "BCC", "BKL", "DF", "VASC"]

    # derm7pt_label_map_func can return "MISC", which is not in derm7pt_columns. Passing such a row on
    # makes process_client_data raise ValueError on its label lookup, so no output is produced at all.
    # NOTE(review): those rows are dropped here; confirm that excluding them (rather than keeping them
    # with all-zero labels) matches the intended dataset definition.
    keep_mask = pd.Series(
        [derm7pt_label_map_func(row) in derm7pt_columns for _, row in derm7pt_df.iterrows()],
        index=derm7pt_df.index,
        dtype=bool,
    )
    derm7pt_df = derm7pt_df[keep_mask]

    process_client_data(
        derm7pt_df,
        "Derm7pt",
        derm7pt_path,
        derm7pt_image_path_func,
        derm7pt_label_map_func,
        derm7pt_columns,
        official_columns,
    )
if __name__ == "__main__":
    # Shared label order that every client's "extended" labels are encoded against.
    official_columns = ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC", "SCC"]
    data_path = os.path.join("fl4health", "datasets", "skin_cancer")

    # Run the dataset-specific preprocessors one after another, in this fixed order.
    for preprocess in (
        preprocess_isic_2019,
        preprocess_ham10000,
        preprocess_pad_ufes_20,
        preprocess_derm7pt,
    ):
        preprocess(data_path, official_columns)