Source code for atomgen.data.utils

"""Utilities for data processing and evaluation."""

from typing import Any, Dict

import numpy as np
from scipy.special import expit, softmax
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import accuracy_score, roc_auc_score
from transformers import EvalPrediction


def compute_metrics_smp(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute MAE for each of the 20 regression targets of the SMP task."""
    pred = eval_pred.predictions
    label = eval_pred.label_ids
    # compute the mean absolute error for each of the 20 targets
    target_names = [
        "rot_const_A",
        "rot_const_B",
        "rot_const_C",
        "dipole_moment",
        "isotropic_polarizability",
        "HOMO",
        "LUMO",
        "gap",
        "electronic_spatial_extent",
        "zero_point_vib_energy",
        "internal_energy_0K",
        "internal_energy_298.15K",
        "enthalpy_298.15K",
        "free_energy_298.15K",
        "heat_capacity_298.15K",
        "thermochem_internal_energy_0K",
        "thermochem_internal_energy_298.15K",
        "thermochem_enthalpy_298.15K",
        "thermochem_free_energy_298.15K",
        "thermochem_heat_capacity_298.15K",
    ]
    maes = {
        name: np.mean(np.abs(pred[:, i] - label[:, i]))
        for i, name in enumerate(target_names)
    }
    return maes

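# Usage sketch (illustrative only): each metric function takes a
# transformers.EvalPrediction, so it can also be called directly on arrays when
# debugging. The synthetic arrays below are placeholders, not data produced by
# this package.
#
#   rng = np.random.default_rng(0)
#   dummy = EvalPrediction(
#       predictions=rng.normal(size=(8, 20)),
#       label_ids=rng.normal(size=(8, 20)),
#   )
#   maes = compute_metrics_smp(dummy)
#   print(maes["HOMO"], maes["gap"])
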
def compute_metrics_ppi(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute AUROC for the PPI task."""
    # convert logits to probabilities and threshold at 0.5
    pred = expit(eval_pred.predictions) > 0.5
    label = eval_pred.label_ids
    # compute AUROC for each of the 20 labels and report the mean
    aurocs = [roc_auc_score(label[:, i], pred[:, i]) for i in range(20)]
    return {"auroc": float(np.mean(aurocs))}

def compute_metrics_res(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute accuracy for the RES task."""
    # softmax over the class dimension, then take the most likely class
    pred = softmax(eval_pred.predictions, axis=1).argmax(axis=1)
    label = eval_pred.label_ids
    # compute accuracy
    acc = accuracy_score(label, pred)
    return {"accuracy": acc}

def compute_metrics_msp(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute AUROC for the MSP task."""
    pred = eval_pred.predictions
    label = eval_pred.label_ids
    # compute AUROC from the raw prediction scores
    auroc = roc_auc_score(label, pred)
    return {"auroc": auroc}

def compute_metrics_lba(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute RMSE and global correlations for the LBA task."""
    pred = eval_pred.predictions
    label = eval_pred.label_ids
    # compute RMSE and global Pearson/Spearman correlations over all predictions
    rmse = np.sqrt(np.mean((pred - label) ** 2))
    global_pearson = pearsonr(pred.flatten(), label.flatten())[0]
    global_spearman = spearmanr(pred.flatten(), label.flatten())[0]
    return {
        "rmse": rmse,
        "global_pearson": global_pearson,
        "global_spearman": global_spearman,
    }

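# Example (illustrative only): with predictions that are the labels shifted by
# a constant, both correlation metrics are 1.0 and the RMSE equals the offset.
#
#   labels = np.array([[1.0], [2.0], [3.0], [4.0]])
#   preds = labels + 0.5
#   compute_metrics_lba(EvalPrediction(predictions=preds, label_ids=labels))
#   # -> {"rmse": 0.5, "global_pearson": 1.0, "global_spearman": 1.0}
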
def compute_metrics_lep(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute AUROC for the LEP task."""
    # convert logits to probabilities and threshold at 0.5
    pred = expit(eval_pred.predictions) > 0.5
    label = eval_pred.label_ids
    # compute AUROC over the binarized predictions
    auroc = roc_auc_score(label, pred)
    return {"auroc": auroc}

def compute_metrics_psr(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute global Spearman correlation for the PSR task."""
    pred = eval_pred.predictions
    label = eval_pred.label_ids
    # compute global Spearman correlation
    global_spearman = spearmanr(pred.flatten(), label.flatten())[0]
    return {"global_spearman": global_spearman}

def compute_metrics_rsr(eval_pred: EvalPrediction) -> Dict[str, Any]:
    """Compute global Spearman correlation for the RSR task."""
    pred = eval_pred.predictions
    label = eval_pred.label_ids
    # compute global Spearman correlation
    global_spearman = spearmanr(pred.flatten(), label.flatten())[0]
    return {"global_spearman": global_spearman}

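# Wiring sketch: these functions match the signature expected by
# transformers.Trainer's ``compute_metrics`` argument, e.g. ``compute_metrics_lba``
# for the LBA task. The model and datasets below are placeholders, not objects
# defined in this module.
#
#   from transformers import Trainer, TrainingArguments
#
#   trainer = Trainer(
#       model=model,                  # placeholder model
#       args=TrainingArguments(output_dir="out", per_device_eval_batch_size=8),
#       train_dataset=train_dataset,  # placeholder dataset
#       eval_dataset=eval_dataset,    # placeholder dataset
#       compute_metrics=compute_metrics_lba,
#   )
#   metrics = trainer.evaluate()      # keys are prefixed, e.g. "eval_rmse"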