Source code for fl4health.datasets.skin_cancer.preprocess_skin

"""
The following code is adapted from the preprocess_skin.py script
from the medical_federated GitHub repository by Seongjun Yang et al.

Paper: https://arxiv.org/abs/2207.03075
Code: https://github.com/wns823/medical_federated.git
- medical_federated/skin_cancer_federated/preprocess_skin.py
"""

import json
import os
from collections.abc import Callable
from typing import Any

import pandas as pd


def save_to_json(data: dict[str, Any], path: str) -> None:
    """Writes a dictionary to disk as tab-indented JSON.

    Args:
        data: The dictionary to serialize.
        path: Destination file path; the file is opened in write mode, so any
            existing content is replaced.
    """
    with open(path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent="\t")
def _one_hot(label: str, columns: list[str], client_name: str) -> list[int]:
    """Returns a one-hot list over ``columns`` with a 1 at the position of ``label``.

    Raises:
        ValueError: If ``label`` is not present in ``columns``.
    """
    try:
        position = columns.index(label)
    except ValueError:
        # Same exception type as the bare list.index failure, but it names the
        # offending label and client so the bad row can be tracked down.
        raise ValueError(
            f"Label {label!r} of client {client_name!r} is not among the expected columns {columns}."
        ) from None
    encoding = [0] * len(columns)
    encoding[position] = 1
    return encoding


def process_client_data(
    dataframe: pd.DataFrame,
    client_name: str,
    data_path: str,
    image_path_func: Callable[[pd.Series], str],
    label_map_func: Callable[[pd.Series], str],
    original_columns: list[str],
    official_columns: list[str],
) -> None:
    """Processes and saves the client-specific dataset.

    For every row of ``dataframe`` the image path and the mapped label are
    computed, the label is one-hot encoded against both column lists, and the
    collected entries are written to ``<data_path>/<client_name>.json``.

    Args:
        dataframe: The dataframe containing the client data.
        client_name: The name of the client; also used as the JSON file stem.
        data_path: The directory in which the JSON file is written.
        image_path_func: A function that constructs the image path from a dataframe row.
        label_map_func: A function that maps a dataframe row to its label name.
        original_columns: The list of original columns for the dataset.
        official_columns: The list of official columns for the dataset.

    Raises:
        ValueError: If a mapped label is missing from ``original_columns`` or
            ``official_columns``.
    """
    entries: list[dict[str, Any]] = []
    for _, row in dataframe.iterrows():
        img_path = image_path_func(row)
        label = label_map_func(row)
        entries.append(
            {
                "img_path": img_path,
                "origin_labels": _one_hot(label, original_columns, client_name),
                "extended_labels": _one_hot(label, official_columns, client_name),
            }
        )

    preprocessed_data: dict[str, Any] = {
        "columns": official_columns,
        "original_columns": original_columns,
        "data": entries,
    }
    save_to_json(preprocessed_data, os.path.join(data_path, f"{client_name}.json"))
def preprocess_isic_2019(data_path: str, official_columns: list[str]) -> None:
    """Builds the Barcelona client file from the ISIC 2019 training data.

    Ground-truth rows whose image appears in the metadata with a ``lesion_id``
    containing "BCN" are kept, written to ``ISIC_2019_core.csv``, read back and
    exported to ``ISIC_19_Barcelona.json`` inside the ``ISIC_2019`` directory.

    Args:
        data_path: The base path to the dataset.
        official_columns: The list of official columns for the dataset; these
            columns are selected from the ground-truth CSV as the labels.
    """
    isic_dir = os.path.join(data_path, "ISIC_2019")
    ground_truth = pd.read_csv(os.path.join(isic_dir, "ISIC_2019_Training_GroundTruth.csv"))
    metadata = pd.read_csv(os.path.join(isic_dir, "ISIC_2019_Training_Metadata.csv"))

    bcn_lesion_ids = [lesion_id for lesion_id in metadata["lesion_id"].dropna() if "BCN" in lesion_id]
    bcn_metadata = metadata[metadata["lesion_id"].isin(bcn_lesion_ids)]
    bcn_ground_truth = ground_truth[ground_truth["image"].isin(bcn_metadata["image"])]

    core_csv_path = os.path.join(isic_dir, "ISIC_2019_core.csv")
    bcn_ground_truth.to_csv(core_csv_path, mode="w")

    image_dir = os.path.join(isic_dir, "ISIC_2019_Training_Input")
    # The file is read back without an index column, so rows can be addressed
    # by position-like integer labels in the loop below.
    barcelona = pd.read_csv(core_csv_path)[["image"] + official_columns + ["UNK"]]

    samples = []
    for position in range(len(barcelona)):
        # Drop the trailing "UNK" value; what is left is the image name followed by the labels.
        image_name, *labels = barcelona.loc[position].values[:-1]
        samples.append(
            {
                "img_path": os.path.join(image_dir, image_name + ".jpg"),
                "origin_labels": labels,
                "extended_labels": list(labels),
            }
        )

    preprocessed_data: dict[str, Any] = {
        "columns": official_columns,
        "original_columns": official_columns,
        "data": samples,
    }
    save_to_json(preprocessed_data, os.path.join(isic_dir, "ISIC_19_Barcelona.json"))
def ham_image_path_func(row: pd.Series) -> str:
    """Builds the image path for a HAM10000 metadata row.

    Args:
        row: A row from the dataframe; its ``image_id`` entry is the file stem.

    Returns:
        The HAM10000 image directory joined with ``<image_id>.jpg``.
    """
    image_dir = os.path.join("fl4health", "datasets", "skin_cancer", "HAM10000")
    file_name = row["image_id"] + ".jpg"
    return os.path.join(image_dir, file_name)
# HAM10000 ``dx`` codes mapped to the label names used by this module.
# Built once at import time instead of on every row.
_HAM_LABEL_MAP: dict[str, str] = {
    "akiec": "AK",
    "bcc": "BCC",
    "bkl": "BKL",
    "df": "DF",
    "mel": "MEL",
    "nv": "NV",
    "vasc": "VASC",
}


def ham_label_map_func(row: pd.Series) -> str:
    """Maps the original label to the new label for the HAM10000 dataset.

    Args:
        row: A row from the dataframe; its ``dx`` entry holds the original label.

    Returns:
        The mapped label.

    Raises:
        KeyError: If the ``dx`` value has no entry in the mapping.
    """
    return _HAM_LABEL_MAP[row["dx"]]
def preprocess_ham10000(data_path: str, official_columns: list[str]) -> None:
    """Splits HAM10000 into its two clients and preprocesses each of them.

    Rows whose ``dataset`` value is "rosendahl" form the ``HAM_rosendahl``
    client and all other rows form ``HAM_vienna``. Each split is written to a
    CSV file in the HAM10000 directory, read back, and handed to
    ``process_client_data``.

    Args:
        data_path: The base path to the dataset.
        official_columns: The list of official columns for the dataset.
    """
    ham_dir = os.path.join(data_path, "HAM10000")
    metadata = pd.read_csv(os.path.join(ham_dir, "HAM10000_metadata"))

    # (client name, CSV file name, row selector) for each client split.
    splits = (
        ("HAM_rosendahl", "HAM_rosendahl.csv", metadata["dataset"] == "rosendahl"),
        ("HAM_vienna", "HAM_vienna.csv", metadata["dataset"] != "rosendahl"),
    )

    # Both CSV files are written before either client is processed.
    for _, csv_name, selector in splits:
        metadata[selector].to_csv(os.path.join(ham_dir, csv_name), mode="w")

    ham_columns = ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC"]
    for client_name, csv_name, _ in splits:
        process_client_data(
            dataframe=pd.read_csv(os.path.join(ham_dir, csv_name)),
            client_name=client_name,
            data_path=ham_dir,
            image_path_func=ham_image_path_func,
            label_map_func=ham_label_map_func,
            original_columns=ham_columns,
            official_columns=official_columns,
        )
def pad_image_path_func(row: pd.Series) -> str:
    """Builds the image path for a PAD-UFES-20 metadata row.

    Args:
        row: A row from the dataframe; its ``img_id`` entry is used as the
            file name unchanged.

    Returns:
        The PAD-UFES-20 image directory joined with ``img_id``.
    """
    image_dir = os.path.join("fl4health", "datasets", "skin_cancer", "PAD-UFES-20")
    file_name = row["img_id"]
    return os.path.join(image_dir, file_name)
# PAD-UFES-20 ``diagnostic`` codes mapped to the label names used by this module.
# Built once at import time instead of on every row.
_PAD_UFES_20_LABEL_MAP: dict[str, str] = {
    "ACK": "AK",
    "BCC": "BCC",
    "MEL": "MEL",
    "NEV": "NV",
    "SCC": "SCC",
    "SEK": "BKL",
}


def pad_label_map_func(row: pd.Series) -> str:
    """Maps the original label to the new label for the PAD-UFES-20 dataset.

    Args:
        row: A row from the dataframe; its ``diagnostic`` entry holds the original label.

    Returns:
        The mapped label.

    Raises:
        KeyError: If the ``diagnostic`` value has no entry in the mapping.
    """
    return _PAD_UFES_20_LABEL_MAP[row["diagnostic"]]
def preprocess_pad_ufes_20(data_path: str, official_columns: list[str]) -> None:
    """Preprocesses the PAD-UFES-20 dataset into a single ``PAD_UFES_20`` client.

    Reads ``metadata.csv`` from the PAD-UFES-20 directory and passes it to
    ``process_client_data`` together with the PAD-specific path and label helpers.

    Args:
        data_path: The base path to the dataset.
        official_columns: The list of official columns for the dataset.
    """
    pad_dir = os.path.join(data_path, "PAD-UFES-20")
    metadata = pd.read_csv(os.path.join(pad_dir, "metadata.csv"))
    pad_columns = ["MEL", "NV", "BCC", "AK", "BKL", "SCC"]

    process_client_data(
        dataframe=metadata,
        client_name="PAD_UFES_20",
        data_path=pad_dir,
        image_path_func=pad_image_path_func,
        label_map_func=pad_label_map_func,
        original_columns=pad_columns,
        official_columns=official_columns,
    )
def derm7pt_image_path_func(row: pd.Series) -> str:
    """Builds the image path for a Derm7pt metadata row.

    Args:
        row: A row from the dataframe; its ``derm`` entry is appended to the
            image directory unchanged.

    Returns:
        The Derm7pt ``images`` directory joined with ``derm``.
    """
    image_dir = os.path.join("fl4health", "datasets", "skin_cancer", "Derm7pt", "images")
    relative_name = row["derm"]
    return os.path.join(image_dir, relative_name)
# Derm7pt ``diagnosis`` strings mapped to the label names used by this module.
# Built once at import time instead of on every row.
# NOTE(review): "MISC" does not appear in any column list visible in this file,
# so process_client_data would raise ValueError for rows mapped to it. Confirm
# that the metadata CSV being read contains no such diagnoses.
_DERM7PT_LABEL_MAP: dict[str, str] = {
    "basal cell carcinoma": "BCC",
    "blue nevus": "NV",
    "clark nevus": "NV",
    "combined nevus": "NV",
    "congenital nevus": "NV",
    "dermal nevus": "NV",
    "dermatofibroma": "DF",  # MISC
    "lentigo": "MISC",
    "melanoma": "MEL",
    "melanoma (0.76 to 1.5 mm)": "MEL",
    "melanoma (in situ)": "MEL",
    "melanoma (less than 0.76 mm)": "MEL",
    "melanoma (more than 1.5 mm)": "MEL",
    "melanoma metastasis": "MEL",
    "melanosis": "MISC",
    "miscellaneous": "MISC",
    "recurrent nevus": "NV",
    "reed or spitz nevus": "NV",
    "seborrheic keratosis": "BKL",
    "vascular lesion": "VASC",  # MISC
}


def derm7pt_label_map_func(row: pd.Series) -> str:
    """Maps the original label to the new label for the Derm7pt dataset.

    Args:
        row: A row from the dataframe; its ``diagnosis`` entry holds the original label.

    Returns:
        The mapped label.

    Raises:
        KeyError: If the ``diagnosis`` value has no entry in the mapping.
    """
    return _DERM7PT_LABEL_MAP[row["diagnosis"]]
def preprocess_derm7pt(data_path: str, official_columns: list[str]) -> None:
    """Preprocesses the Derm7pt dataset into a single ``Derm7pt`` client.

    Reads ``meta/meta_core.csv`` from the Derm7pt directory and passes it to
    ``process_client_data`` together with the Derm7pt-specific path and label helpers.

    Args:
        data_path: The base path to the dataset.
        official_columns: The list of official columns for the dataset.
    """
    derm7pt_dir = os.path.join(data_path, "Derm7pt")
    metadata = pd.read_csv(os.path.join(derm7pt_dir, "meta", "meta_core.csv"))
    derm7pt_columns = ["MEL", "NV", "BCC", "BKL", "DF", "VASC"]

    process_client_data(
        dataframe=metadata,
        client_name="Derm7pt",
        data_path=derm7pt_dir,
        image_path_func=derm7pt_image_path_func,
        label_map_func=derm7pt_label_map_func,
        original_columns=derm7pt_columns,
        official_columns=official_columns,
    )
if __name__ == "__main__":
    # Script entry point: run every dataset preprocessor against the same base
    # directory and the same list of official label columns.
    data_path = os.path.join("fl4health", "datasets", "skin_cancer")
    official_columns = ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC", "SCC"]

    for preprocess in (
        preprocess_isic_2019,
        preprocess_ham10000,
        preprocess_pad_ufes_20,
        preprocess_derm7pt,
    ):
        preprocess(data_path, official_columns)