Skip to content

CdaDiseaseFactory

Bases: CdaFactory

CdaDiseaseFactory uses both the diagnosis and researchsubject tables to format the information about the disease diagnosis into the Disease element of the Phenopacket Schema.

Note, CdaDiseaseFactory interprets the age_at_diagnosis as the age of onset.

  • 'primary_diagnosis'
  • 'primary_diagnosis_site'
  • 'primary_diagnosis_condition'
  • 'stage'
  • 'age_at_diagnosis'

Parameters:

Name Type Description Default
disease_term_mapper OpMapper

an :class:OpMapper for finding the disease term in the row fields.

required
Source code in src/oncoexporter/cda/cda_disease_factory.py
class CdaDiseaseFactory(CdaFactory):
    """
    `CdaDiseaseFactory` uses both the `diagnosis` and `researchsubject` tables to format the information
    about the disease diagnosis into the Disease element of the Phenopacket Schema.

    Note, `CdaDiseaseFactory` interprets the `age_at_diagnosis` as the age of onset.

    - 'primary_diagnosis'
    - 'primary_diagnosis_site'
    - 'primary_diagnosis_condition'
    - 'stage'
    - 'age_at_diagnosis'

    :param disease_term_mapper: an :class:`OpMapper` for finding the disease term in the row fields.
    """

    def __init__(self, disease_term_mapper: OpMapper):
        self._disease_term_mapper = disease_term_mapper
        self._stage_mapper = OpDiseaseStageMapper()
        self._uberon_mapper = OpUberonMapper()

        self._required_fields = tuple(set(itertools.chain(
            self._disease_term_mapper.get_fields(),
            self._stage_mapper.get_fields(),
            self._uberon_mapper.get_fields(),
            ('age_at_diagnosis',),
        )))
        # todo -- add in ICCDO Mapper


    def to_ga4gh(self, row: pd.Series) -> pp.Disease:
        """
        Convert a row of the table obtained by merging CDA `diagnosis` and `researchsubject` tables into a Disease
         message of the Phenopacket Schema.

        The row is expected to contain the following columns:
        - 'stage'
        - 'primary_diagnosis_condition'
        - 'primary_diagnosis_site'
        - 'primary_diagnosis'
        - 'age_at_diagnosis'

        :param row: a :class:`pd.Series` with a row from the merged CDA table.
        """
        if not isinstance(row, pd.Series):
            raise ValueError(f"Invalid argument. Expected pandas Series but got {type(row)}")

        if any(field not in row for field in self._required_fields):
            #missing = row.index.difference(self._required_fields) # this gets items in row not in _required_fields but we want the opposite
            missing = []
            print(row.index)
            for i in self._required_fields:
                print('i:', i)
                if i not in row.index:
                    print('not in row.index')
                    missing.append(i)

            raise ValueError(f'Required field(s) are missing: {missing}')

        # This is the component we build here.
        disease = pp.Disease()

        term = self._disease_term_mapper.get_ontology_term(row=row)
        if term is None:
            # `term` is a required field.
            raise ValueError(f'Could not parse `term` from the row {row}')
        disease.term.CopyFrom(term)

        # We will interpret age_at_diagnosis as age of onset

        # raise ValueError(f"days argument must be an int or a str but was {type(days)}")
        # ValueError: days argument must be an int or a str but was <class 'pandas._libs.missing.NAType'>
        iso8601_age_of_onset = self.days_to_iso(str(row['age_at_diagnosis']))
        if iso8601_age_of_onset is not None:
            disease.onset.age.iso8601duration = iso8601_age_of_onset

        # Deal with stage
        stage = self._stage_mapper.get_ontology_term(row=row)
        if stage is not None:
            disease.disease_stage.append(stage)

        primary_site = self._uberon_mapper.get_ontology_term(row)
        if primary_site is not None:
            disease.primary_site.CopyFrom(primary_site)

        # Deal with morphology - clinical_tnm_finding_list seems like the most
        # appropriate place to put this
        # TODO -- work out where this goes. I do not think the ICDO will give us TNM
        # clinical_tnm_finding_list = None #self._parse_morphology_into_ontology_term(row)

        return disease

to_ga4gh(row)

Convert a row of the table obtained by merging CDA diagnosis and researchsubject tables into a Disease message of the Phenopacket Schema.

The row is expected to contain the following columns: - 'stage' - 'primary_diagnosis_condition' - 'primary_diagnosis_site' - 'primary_diagnosis' - 'age_at_diagnosis'

Parameters:

Name Type Description Default
row Series

a :class:pd.Series with a row from the merged CDA table.

required
Source code in src/oncoexporter/cda/cda_disease_factory.py
def to_ga4gh(self, row: pd.Series) -> pp.Disease:
    """
    Convert a row of the table obtained by merging CDA `diagnosis` and `researchsubject` tables into a Disease
     message of the Phenopacket Schema.

    The row is expected to contain the following columns:
    - 'stage'
    - 'primary_diagnosis_condition'
    - 'primary_diagnosis_site'
    - 'primary_diagnosis'
    - 'age_at_diagnosis'

    :param row: a :class:`pd.Series` with a row from the merged CDA table.
    """
    if not isinstance(row, pd.Series):
        raise ValueError(f"Invalid argument. Expected pandas Series but got {type(row)}")

    if any(field not in row for field in self._required_fields):
        #missing = row.index.difference(self._required_fields) # this gets items in row not in _required_fields but we want the opposite
        missing = []
        print(row.index)
        for i in self._required_fields:
            print('i:', i)
            if i not in row.index:
                print('not in row.index')
                missing.append(i)

        raise ValueError(f'Required field(s) are missing: {missing}')

    # This is the component we build here.
    disease = pp.Disease()

    term = self._disease_term_mapper.get_ontology_term(row=row)
    if term is None:
        # `term` is a required field.
        raise ValueError(f'Could not parse `term` from the row {row}')
    disease.term.CopyFrom(term)

    # We will interpret age_at_diagnosis as age of onset

    # raise ValueError(f"days argument must be an int or a str but was {type(days)}")
    # ValueError: days argument must be an int or a str but was <class 'pandas._libs.missing.NAType'>
    iso8601_age_of_onset = self.days_to_iso(str(row['age_at_diagnosis']))
    if iso8601_age_of_onset is not None:
        disease.onset.age.iso8601duration = iso8601_age_of_onset

    # Deal with stage
    stage = self._stage_mapper.get_ontology_term(row=row)
    if stage is not None:
        disease.disease_stage.append(stage)

    primary_site = self._uberon_mapper.get_ontology_term(row)
    if primary_site is not None:
        disease.primary_site.CopyFrom(primary_site)

    # Deal with morphology - clinical_tnm_finding_list seems like the most
    # appropriate place to put this
    # TODO -- work out where this goes. I do not think the ICDO will give us TNM
    # clinical_tnm_finding_list = None #self._parse_morphology_into_ontology_term(row)

    return disease