Skip to content

CdaIndividualFactory

Bases: CdaFactory

CdaIndividualFactory creates a GA4GH Individual message from a row of the CDA subject table.

The structure of the CDA subject table is as follows:

- subject_id (*)
- subject_identifier
- species
- sex (*)
- race
- ethnicity
- days_to_birth (*)
- subject_associated_project
- vital_status (*)
- days_to_death (*)
- cause_of_death (*)

(*) indicates a used field.

Source code in src/oncoexporter/cda/cda_individual_factory.py
class CdaIndividualFactory(CdaFactory):
    """
    `CdaIndividualFactory` creates a GA4GH Individual message from a row of the CDA *subject* table.

    The structure of the CDA subject table is as follows:

        - subject_id (*)
        - subject_identifier
        - species
        - sex (*)
        - race
        - ethnicity
        - days_to_birth (*)
        - subject_associated_project
        - vital_status (*)
        - days_to_death (*)
        - cause_of_death (*)

    (*) indicates a used field.
    """

    def __init__(self) -> None:
        self._cause_of_death_mapper = OpCauseOfDeathMapper()
        # Lower-case spellings recognised as male / female; `to_ga4gh` lower-cases
        # the incoming value before looking it up here.
        self._male_sex = {'m', 'male'}
        self._female_sex = {'f', 'female'}

    def _process_vital_status(self, row: pd.Series):
        """
        Build a VitalStatus message from the vital-status related columns of a subject row.

        :param row: a row from the CDA subject table
        :type row: pd.Series
        :returns: A vital status object with the survival time and the cause of death if available,
            or `None` if `vital_status` is missing or is neither "Alive" nor "Dead".
        :rtype: PPkt.VitalStatus
        :raises ValueError: if `row` is not a pandas Series.
        """
        if not isinstance(row, pd.Series):
            raise ValueError(f"'row' argument must be pandas Series but was {type(row)}")
        vital_status = self.get_item(row, "vital_status")
        days_to_death = self.get_item(row, "days_to_death")
        if vital_status is None:
            return None
        status_codes = {
            "Alive": PPkt.VitalStatus.ALIVE,
            "Dead": PPkt.VitalStatus.DECEASED,
        }
        if vital_status not in status_codes:
            return None
        vstatus = PPkt.VitalStatus()
        vstatus.status = status_codes[vital_status]
        if days_to_death is not None:
            try:
                # `to_ga4gh` stringifies the row, so the value may arrive in float
                # notation (compare the '-15987.0' example for days_to_birth).
                # int('365.0') raises, hence the detour through float().
                vstatus.survival_time_in_days = int(float(days_to_death))
            except (TypeError, ValueError, OverflowError):
                # Missing ('nan', 'None'), non-numeric or out-of-range value:
                # leave the survival time unset.
                # TODO: report?
                pass
        cause = self._cause_of_death_mapper.get_ontology_term(row)
        if cause is not None:
            vstatus.cause_of_death.CopyFrom(cause)
        return vstatus

    def to_ga4gh(self, row: pd.Series):
        """
        Convert a row from the CDA subject table into an Individual message (GA4GH Phenopacket Schema).

        Age and vital status are extracted on a best-effort basis: if either cannot be derived
        from the row, the corresponding field of the Individual is left unset.

        :param row: a row from the CDA subject table
        :type row: pd.Series
        :returns: A GA4GH Phenopacket Schema Individual object that corresponds to the subject in this row.
        :rtype: PPkt.Individual
        :raises ValueError: if `row` is not a pandas Series.
        """
        if not isinstance(row, pd.Series):
            raise ValueError(f"Invalid argument. Expected pandas series but got {type(row)}")
        row = row.astype(str)
        subject_id = row['subject_id']
        # Normalise so that e.g. 'Male', 'F' or ' female ' are recognised as well.
        sex = row['sex'].strip().lower()
        days_to_birth = row['days_to_birth']

        iso_age = None
        try:
            # A valid value looks like '-15987.0': parse as float first, then
            # truncate to int; the sign is dropped to obtain an age in days.
            iso_age = self.days_to_iso(days=abs(int(float(days_to_birth))))
        except (ValueError, OverflowError):
            # Missing ('nan', 'None') or non-numeric days_to_birth: leave the age unset.
            pass

        # Vital status does not depend on days_to_birth, so it is extracted
        # separately; an unparsable birth offset must not discard it.
        vstat = None
        try:
            vstat = self._process_vital_status(row)
        except Exception:
            # Best effort, as before: a failure while assembling the vital status
            # must not prevent the Individual from being created.
            # TODO: handle in a better way
            pass

        individual = PPkt.Individual()
        individual.id = subject_id

        # time_at_last_encounter
        if iso_age is not None:
            individual.time_at_last_encounter.age.iso8601duration = iso_age

        # vital status
        if vstat is not None:
            individual.vital_status.CopyFrom(vstat)

        # sex
        if sex in self._male_sex:
            individual.sex = PPkt.MALE
        elif sex in self._female_sex:
            individual.sex = PPkt.FEMALE
        else:
            individual.sex = PPkt.UNKNOWN_SEX

        # taxonomy, always Homo here
        individual.taxonomy.id = "NCBITaxon:9606"
        individual.taxonomy.label = "Homo sapiens"

        return individual

to_ga4gh(row)

Convert a row from the CDA subject table into an Individual message (GA4GH Phenopacket Schema).

Parameters:

Name Type Description Default
row Series

a row from the CDA subject table

required

Returns:

Type Description
PPkt.Individual

A GA4GH Phenopacket Schema Individual object that corresponds to the subject in this row.

Raises:

Type Description
ValueError

if `row` is not a pandas Series.

Source code in src/oncoexporter/cda/cda_individual_factory.py
def to_ga4gh(self, row: pd.Series):
    """
    Convert a row from the CDA subject table into an Individual message (GA4GH Phenopacket Schema).

    Age and vital status are extracted on a best-effort basis: if either cannot be derived
    from the row, the corresponding field of the Individual is left unset.

    :param row: a row from the CDA subject table
    :type row: pd.Series
    :returns: A GA4GH Phenopacket Schema Individual object that corresponds to the subject in this row.
    :rtype: PPkt.Individual
    :raises ValueError: if `row` is not a pandas Series.
    """
    if not isinstance(row, pd.Series):
        raise ValueError(f"Invalid argument. Expected pandas series but got {type(row)}")
    row = row.astype(str)
    subject_id = row['subject_id']
    # Normalise so that e.g. 'Male', 'F' or ' female ' are recognised as well.
    sex = row['sex'].strip().lower()
    days_to_birth = row['days_to_birth']

    iso_age = None
    try:
        # A valid value looks like '-15987.0': parse as float first, then
        # truncate to int; the sign is dropped to obtain an age in days.
        iso_age = self.days_to_iso(days=abs(int(float(days_to_birth))))
    except (ValueError, OverflowError):
        # Missing ('nan', 'None') or non-numeric days_to_birth: leave the age unset.
        pass

    # Vital status does not depend on days_to_birth, so it is extracted
    # separately; an unparsable birth offset must not discard it.
    vstat = None
    try:
        vstat = self._process_vital_status(row)
    except Exception:
        # Best effort, as before: a failure while assembling the vital status
        # must not prevent the Individual from being created.
        # TODO: handle in a better way
        pass

    individual = PPkt.Individual()
    individual.id = subject_id

    # time_at_last_encounter
    if iso_age is not None:
        individual.time_at_last_encounter.age.iso8601duration = iso_age

    # vital status
    if vstat is not None:
        individual.vital_status.CopyFrom(vstat)

    # sex
    if sex in self._male_sex:
        individual.sex = PPkt.MALE
    elif sex in self._female_sex:
        individual.sex = PPkt.FEMALE
    else:
        individual.sex = PPkt.UNKNOWN_SEX

    # taxonomy, always Homo here
    individual.taxonomy.id = "NCBITaxon:9606"
    individual.taxonomy.label = "Homo sapiens"

    return individual