Source code for gpsea.analysis.temporal._api

import abc
import math
import typing

from collections import defaultdict

import pandas as pd
import scipy.stats

from gpsea.model import Patient

from ._base import Survival
from ._util import prepare_censored_data
from .stats import SurvivalStatistic

from ..clf import GenotypeClassifier
from .._base import MonoPhenotypeAnalysisResult, StatisticResult, AnalysisException
from .._partition import ContinuousPartitioning


[docs] class Endpoint(ContinuousPartitioning, metaclass=abc.ABCMeta): """ `Endpoint` computes survival for the analyzed individual. Example endpoints include :func:`~gpsea.analysis.temporal.endpoint.death`, :func:`~gpsea.analysis.temporal.endpoint.disease_onset`, or onset of a phenotypic feature (:func:`~gpsea.analysis.temporal.endpoint.hpo_onset`). """
[docs] @abc.abstractmethod def compute_survival( self, patient: Patient, ) -> typing.Optional[Survival]: """ Compute a survival for a given `patient` or `None` if the `patient` lacks the required data (e.g. age of death or age at last investigation). """ pass
[docs] class SurvivalAnalysisResult(MonoPhenotypeAnalysisResult): """ `SurvivalAnalysisResult` includes the results of a :class:`~gpsea.analysis.temporal.SurvivalAnalysis`. The genotype categories and survival are reported in the `data` data frame with the following structure: ============ =========== ============================================ patient_id genotype phenotype ============ =========== ============================================ patient_1 0 `Survival(value=123.4, is_censored=False)` patient_2 0 `None` patient_3 `None` `Survival(value=456.7, is_censored=True)` patient_4 1 `None` ... ... ... ============ =========== ============================================ The index includes the individual IDs (`patient_id`), and then there are 2 columns with the `genotype` group id (:attr:`~gpsea.analysis.clf.PatientCategory.cat_id`) and the `phenotype` with the survival represented as :class:`~gpsea.analysis.temporal.Survival` object. A `genotype` value may be missing (`None`) if the individual cannot be assigned into a genotype category. Similarly, a `survival` is `None` if computing the survival for an individual is impossible. """ def __init__( self, gt_clf: GenotypeClassifier, endpoint: Endpoint, statistic: SurvivalStatistic, data: pd.DataFrame, statistic_result: StatisticResult, ): super().__init__( gt_clf=gt_clf, phenotype=endpoint, statistic=statistic, data=data, statistic_result=statistic_result, ) assert isinstance(endpoint, Endpoint) @property def endpoint(self) -> Endpoint: """ Get the endpoint used to compute the survival of the individuals. """ # We are sure that `self._phenotype` is assignable to `Endpoint` # because of the instance check in `__init__` and `Endpoint` # being a subclass of `Partitioning`. return self._phenotype # type: ignore
[docs] def plot_kaplan_meier_curves( self, ax, ): """ Plot genotype group survivals on the provided axes. The axes includes legend. However, if no survival is available for a genotype group, the group name will be missing from the legend. :param ax: a Matplotlib `Axes` to draw on. """ for pat_cat in self._gt_clf.get_categories(): survivals = self._data.loc[ self._data[MonoPhenotypeAnalysisResult.GT_COL] == pat_cat.cat_id, MonoPhenotypeAnalysisResult.PH_COL, ] non_na = survivals[survivals.notna()] if len(non_na) > 0: censored_data = prepare_censored_data(survivals=non_na) data = scipy.stats.ecdf(censored_data) data.sf.plot(ax, label=pat_cat.name) ax.legend()
def __eq__(self, value: object) -> bool: return isinstance(value, SurvivalAnalysisResult) and super( MonoPhenotypeAnalysisResult, self ).__eq__(value) def __hash__(self) -> int: return super(MonoPhenotypeAnalysisResult, self).__hash__() def __str__(self) -> str: return ( "SurvivalAnalysisResult(" f"gt_clf={self._gt_clf}, " f"endpoint={self._phenotype}, " f"statistic={self._statistic}, " f"data={self._data}, " f"statistic_result={self._statistic_result})" ) def __repr__(self) -> str: return str(self)
[docs] class SurvivalAnalysis: """ `SurvivalAnalysis` compares the survivals of genotype groups with respect to an :class:`~gpsea.analysis.temporal.Endpoint`. The cohort is partitioned into groups using a genotype predicate and survival is computed for each cohort member. The difference between survivals is tested with selected :class:`~gpsea.analysis.temporal.stats.SurvivalStatistic`. The analysis may raise an :class:`~gpsea.analysis.AnalysisException` if issues are encountered. The exception includes the reason(s) in `args` as well as any partial data, to help with troubleshooting. """ def __init__( self, statistic: SurvivalStatistic, ): assert isinstance(statistic, SurvivalStatistic) self._statistic = statistic
[docs] def compare_genotype_vs_survival( self, cohort: typing.Iterable[Patient], gt_clf: GenotypeClassifier, endpoint: Endpoint, ) -> SurvivalAnalysisResult: """ Execute the survival analysis on a given `cohort`. """ idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id") data = pd.DataFrame( None, index=idx, columns=MonoPhenotypeAnalysisResult.DATA_COLUMNS, ) survivals = defaultdict(list) # Apply the predicate and the survival metric on the cohort for patient in cohort: gt_cat = gt_clf.test(patient) if gt_cat is None: data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = None else: data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.GT_COL] = ( gt_cat.category.cat_id ) survival = endpoint.compute_survival(patient) data.loc[patient.patient_id, MonoPhenotypeAnalysisResult.PH_COL] = survival # type: ignore if gt_cat is not None and survival is not None: survivals[gt_cat].append(survival) vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations()) result = self._statistic.compute_pval(vals) if math.isnan(result.pval): raise AnalysisException( dict(data=data), "The survival values did not meet the expectation of the statistical test!", ) return SurvivalAnalysisResult( gt_clf=gt_clf, endpoint=endpoint, statistic=self._statistic, data=data, statistic_result=result, )