Source code for pystatsbio.diagnostic._cutoff
"""Optimal cutoff selection for diagnostic tests.
Three methods: Youden index (maximize sensitivity + specificity − 1),
closest-to-top-left (minimize Euclidean distance to the (0,1) corner),
and cost-based (minimize weighted misclassification cost given
prevalence).
Validates against: R ``OptimalCutpoints::optimal.cutpoints()``.
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from numpy.typing import NDArray
from pystatsbio.diagnostic._common import ROCResult
[docs]
@dataclass(frozen=True)
class CutoffResult:
"""Result of optimal cutoff selection."""
cutoff: float
sensitivity: float
specificity: float
method: str # 'youden', 'closest_topleft', 'cost'
criterion_value: float # value of the optimization criterion
[docs]
def optimal_cutoff(
roc_result: ROCResult,
*,
method: str = "youden",
cost_fp: float = 1.0,
cost_fn: float = 1.0,
prevalence: float | None = None,
) -> CutoffResult:
"""Find optimal classification cutoff from an ROC curve.
Parameters
----------
roc_result : ROCResult
A computed ROC curve.
method : str
``'youden'`` — maximize sensitivity + specificity − 1.
``'closest_topleft'`` — minimize distance to ``(FPR=0, TPR=1)``.
``'cost'`` — minimize weighted misclassification cost.
cost_fp, cost_fn : float
Costs of false positives and false negatives (for ``method='cost'``).
prevalence : float or None
Disease prevalence (for ``method='cost'``). Uses sample
prevalence ``n_positive / (n_positive + n_negative)`` if
``None``.
Returns
-------
CutoffResult
Validates against: R ``OptimalCutpoints::optimal.cutpoints()``
"""
valid_methods = ("youden", "closest_topleft", "cost")
if method not in valid_methods:
raise ValueError(
f"method must be one of {valid_methods}, got {method!r}"
)
tpr = roc_result.tpr
fpr = roc_result.fpr
thresholds = roc_result.thresholds
# Exclude the boundary points (inf / -inf) from candidate set
# because they correspond to "classify nobody" or "classify everybody"
finite_mask = np.isfinite(thresholds)
if finite_mask.sum() == 0:
raise ValueError("ROC result has no finite thresholds")
tpr_f = tpr[finite_mask]
fpr_f = fpr[finite_mask]
thresh_f = thresholds[finite_mask]
spec_f = 1.0 - fpr_f
if method == "youden":
# J = sens + spec - 1 = TPR - FPR
criterion = tpr_f - fpr_f
best_idx = int(np.argmax(criterion))
crit_val = float(criterion[best_idx])
elif method == "closest_topleft":
# Euclidean distance to (FPR=0, TPR=1)
dist = np.sqrt(fpr_f ** 2 + (1.0 - tpr_f) ** 2)
best_idx = int(np.argmin(dist))
crit_val = float(dist[best_idx])
else: # cost
if prevalence is None:
prev = roc_result.n_positive / (
roc_result.n_positive + roc_result.n_negative
)
else:
prev = prevalence
# Expected cost = cost_fp * FPR * (1-prev) + cost_fn * (1-TPR) * prev
cost = cost_fp * fpr_f * (1 - prev) + cost_fn * (1 - tpr_f) * prev
best_idx = int(np.argmin(cost))
crit_val = float(cost[best_idx])
return CutoffResult(
cutoff=float(thresh_f[best_idx]),
sensitivity=float(tpr_f[best_idx]),
specificity=float(spec_f[best_idx]),
method=method,
criterion_value=crit_val,
)