Skip to content

CdaDiseaseFactory

Bases: CdaFactory

CdaDiseaseFactory uses both the diagnosis and researchsubject tables to format the information about the disease diagnosis into the Disease element of the Phenopacket Schema.

Note, CdaDiseaseFactory interprets the age_at_diagnosis as the age of onset.

  • 'primary_diagnosis'
  • 'primary_diagnosis_site'
  • 'primary_diagnosis_condition'
  • 'stage' (For GDC entries, we have to get the stage from the GDC API, as diagnosis.tumor_stage, which CDA gets from GDC, is empty)
  • 'age_at_diagnosis'

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| disease_term_mapper | OpMapper | an `OpMapper` for finding the disease term in the row fields (called in mapper._configure.py). | required |
Source code in src/oncoexporter/cda/cda_disease_factory.py
class CdaDiseaseFactory(CdaFactory):
    """
    `CdaDiseaseFactory` uses both the `diagnosis` and `researchsubject` tables to format the information
    about the disease diagnosis into the Disease element of the Phenopacket Schema.

    Note, `CdaDiseaseFactory` interprets the `age_at_diagnosis` as the age of onset.

    The rows passed to :meth:`to_ga4gh` are expected to provide the following columns:

    - 'primary_diagnosis'
    - 'primary_diagnosis_site'
    - 'primary_diagnosis_condition'
    - 'stage' (for GDC entries, we have to get the stage from the GDC API, as `diagnosis.tumor_stage`,
      which CDA gets from GDC, is empty)
    - 'age_at_diagnosis'
    - 'subject_id' (only read when 'stage' is empty, to query the GDC API)

    :param disease_term_mapper: an :class:`OpMapper` for finding the disease term in the row fields.
      (called in mapper._configure.py)
    """

    def __init__(self, disease_term_mapper: OpMapper):
        self._gdc_service = GdcService()
        self._disease_term_mapper = disease_term_mapper
        self._stage_mapper = OpDiseaseStageMapper()
        self._uberon_mapper = OpUberonMapper()

        # De-duplicate with `dict.fromkeys` instead of `set` so the field order, and therefore the
        # order in which missing fields are reported, is the same on every run.
        self._required_fields = tuple(dict.fromkeys(itertools.chain(
            self._disease_term_mapper.get_fields(),
            self._stage_mapper.get_fields(),
            self._uberon_mapper.get_fields(),
            ('age_at_diagnosis',),
        )))
        # TODO -- add in ICCDO Mapper

    def to_ga4gh(self, row: pd.Series) -> pp.Disease:
        """
        Convert a row of the table obtained by merging CDA `diagnosis` and `researchsubject` tables into a Disease
        message of the Phenopacket Schema.

        The row is expected to contain the following columns:

        - 'stage'
        - 'primary_diagnosis_condition'
        - 'primary_diagnosis_site'
        - 'primary_diagnosis'
        - 'age_at_diagnosis'

        If 'stage' is an empty string, 'subject_id' is read as well and used to fetch the stage from
        the GDC service.

        :param row: a :class:`pd.Series` with a row from the merged CDA table.
        :returns: the Disease message built from the row.
        :raises ValueError: if `row` is not a :class:`pd.Series`, if a required field is missing,
          or if the disease term cannot be mapped.
        """
        if not isinstance(row, pd.Series):
            raise ValueError(f"Invalid argument. Expected pandas Series but got {type(row)}")

        # Collect the absent fields in a single pass; an empty list means the row is complete.
        missing = [field for field in self._required_fields if field not in row.index]
        if missing:
            raise ValueError(f'Required field(s) are missing: {missing}')

        # This is the component we build here.
        disease = pp.Disease()

        # Map the disease term to NCIT. `term` is a required field of Disease.
        term = self._disease_term_mapper.get_ontology_term(row=row)
        if term is None:
            raise ValueError(f'Could not parse `term` from the row {row}')
        disease.term.CopyFrom(term)

        # We will interpret age_at_diagnosis as age of onset
        iso8601_age_of_onset = self.days_to_iso(str(row['age_at_diagnosis']))
        if iso8601_age_of_onset is not None:
            disease.onset.age.iso8601duration = iso8601_age_of_onset

        # Deal with stage.
        # diagnoses.tumor_stage is not filled in in GDC, so the stage coming from CDA is empty for
        # GDC entries. diagnoses.ajcc_pathologic_stage can be used instead (other alternatives are
        # diagnoses.ajcc_clinical_stage, diagnoses.ann_arbor_pathologic_stage and
        # diagnoses.ann_arbor_clinical_stage, but ajcc_pathologic_stage is the most prevalent in GDC).
        # Note: only the stage from the 1st diagnosis is selected, out of potentially 4
        # (diagnoses.0.diagnosis_id ... diagnoses.3.diagnosis_id).
        stage_str = row["stage"]
        if stage_str == '':  # probably should put in a check for data source here
            if 'subject_id' not in row.index:
                # Report the absent column the same way as the other required fields
                # instead of leaking a bare KeyError from the lookup below.
                raise ValueError(f"Required field(s) are missing: {['subject_id']}")
            # Remove the initial data source label (everything up to and including the first dot).
            subj_id = re.sub(r"^[^.]+\.", "", row["subject_id"])
            stage_str = self._gdc_service.fetch_stage(subj_id)  # returns a string

        # Map the stage string to an ontology term.
        stage = self._stage_mapper.get_ontology_term(stage_str=stage_str)
        if stage is not None:
            disease.disease_stage.append(stage)  # repeated field, so append instead of CopyFrom

        # Map primary site to uberon.
        primary_site = self._uberon_mapper.get_ontology_term(row)
        if primary_site is not None:
            disease.primary_site.CopyFrom(primary_site)

        # Deal with morphology - clinical_tnm_finding_list seems like the most
        # appropriate place to put this
        # TODO -- work out where this goes. I do not think the ICDO will give us TNM

        return disease

to_ga4gh(row)

Convert a row of the table obtained by merging CDA diagnosis and researchsubject tables into a Disease message of the Phenopacket Schema.

The row is expected to contain the following columns:

  • 'stage'
  • 'primary_diagnosis_condition'
  • 'primary_diagnosis_site'
  • 'primary_diagnosis'
  • 'age_at_diagnosis'

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| row | Series | a `pd.Series` with a row from the merged CDA table. | required |
Source code in src/oncoexporter/cda/cda_disease_factory.py
def to_ga4gh(self, row: pd.Series) -> pp.Disease:
    """
    Convert a row of the table obtained by merging CDA `diagnosis` and `researchsubject` tables into a Disease
    message of the Phenopacket Schema.

    The row is expected to contain the following columns:

    - 'stage'
    - 'primary_diagnosis_condition'
    - 'primary_diagnosis_site'
    - 'primary_diagnosis'
    - 'age_at_diagnosis'

    If 'stage' is an empty string, 'subject_id' is read as well and used to fetch the stage from
    the GDC service.

    :param row: a :class:`pd.Series` with a row from the merged CDA table.
    :returns: the Disease message built from the row.
    :raises ValueError: if `row` is not a :class:`pd.Series`, if a required field is missing,
      or if the disease term cannot be mapped.
    """
    if not isinstance(row, pd.Series):
        raise ValueError(f"Invalid argument. Expected pandas Series but got {type(row)}")

    # Collect the absent fields in a single pass; an empty list means the row is complete.
    missing = [field for field in self._required_fields if field not in row.index]
    if missing:
        raise ValueError(f'Required field(s) are missing: {missing}')

    # This is the component we build here.
    disease = pp.Disease()

    # Map the disease term to NCIT. `term` is a required field of Disease.
    term = self._disease_term_mapper.get_ontology_term(row=row)
    if term is None:
        raise ValueError(f'Could not parse `term` from the row {row}')
    disease.term.CopyFrom(term)

    # We will interpret age_at_diagnosis as age of onset
    iso8601_age_of_onset = self.days_to_iso(str(row['age_at_diagnosis']))
    if iso8601_age_of_onset is not None:
        disease.onset.age.iso8601duration = iso8601_age_of_onset

    # Deal with stage.
    # diagnoses.tumor_stage is not filled in in GDC, so the stage coming from CDA is empty for
    # GDC entries. diagnoses.ajcc_pathologic_stage can be used instead (other alternatives are
    # diagnoses.ajcc_clinical_stage, diagnoses.ann_arbor_pathologic_stage and
    # diagnoses.ann_arbor_clinical_stage, but ajcc_pathologic_stage is the most prevalent in GDC).
    # Note: only the stage from the 1st diagnosis is selected, out of potentially 4
    # (diagnoses.0.diagnosis_id ... diagnoses.3.diagnosis_id).
    stage_str = row["stage"]
    if stage_str == '':  # probably should put in a check for data source here
        if 'subject_id' not in row.index:
            # Report the absent column the same way as the other required fields
            # instead of leaking a bare KeyError from the lookup below.
            raise ValueError(f"Required field(s) are missing: {['subject_id']}")
        # Remove the initial data source label (everything up to and including the first dot).
        subj_id = re.sub(r"^[^.]+\.", "", row["subject_id"])
        stage_str = self._gdc_service.fetch_stage(subj_id)  # returns a string

    # Map the stage string to an ontology term.
    stage = self._stage_mapper.get_ontology_term(stage_str=stage_str)
    if stage is not None:
        disease.disease_stage.append(stage)  # repeated field, so append instead of CopyFrom

    # Map primary site to uberon.
    primary_site = self._uberon_mapper.get_ontology_term(row)
    if primary_site is not None:
        disease.primary_site.CopyFrom(primary_site)

    # Deal with morphology - clinical_tnm_finding_list seems like the most
    # appropriate place to put this
    # TODO -- work out where this goes. I do not think the ICDO will give us TNM

    return disease