Skip to content

CdaBiosampleFactory

Bases: CdaFactory

Class for creating a Biosample element from a row of the specimen CDA table.

The class returns a GA4GH Biosample object that corresponds to a row of the specimen table. The CDA specimen table has the following fields.

- specimen_id: identifier
- specimen_identifier: structured field with additional information
- specimen_associated_project: e.g., CGCI-HTMCP-CC
- days_to_collection: age in days at time specimen was collected
- primary_disease_type: to be clarified
- anatomical_site: body location at which specimen was collected
- source_material_type: todo
- specimen_type: todo
- derived_from_specimen: todo
- derived_from_subject: todo
- subject_id: todo
- researchsubject_id: todo
Source code in src/oncoexporter/cda/cda_biosample_factory.py
class CdaBiosampleFactory(CdaFactory):
    """
    Factory that turns one row of the CDA `specimen` table into a GA4GH `Biosample`.

    Each call to `to_ga4gh` builds a fresh Biosample message from a single row of the
    specimen table. The CDA specimen table has the following fields.

        - specimen_id: identifier
        - specimen_identifier: structured field with additional information
        - specimen_associated_project: e.g., CGCI-HTMCP-CC
        - days_to_collection: age in days at time specimen was collected
        - primary_disease_type: to be clarified
        - anatomical_site: body location at which specimen was collected
        - source_material_type: todo
        - specimen_type: todo
        - derived_from_specimen: todo
        - derived_from_subject: todo
        - subject_id: todo
        - researchsubject_id: todo
    """

    def to_ga4gh(self, row) -> PPKt.Biosample:
        """
        Build a `Biosample` message from a single specimen row.

        :param row: mapping-like object indexed by CDA specimen column name; the columns
            read here are `specimen_id`, `derived_from_subject`, `days_to_collection`,
            `derived_from_specimen`, `anatomical_site`, `specimen_type`,
            `primary_disease_type` and `source_material_type`.
        :returns: the populated `PPKt.Biosample`. Optional fields are left unset when the
            row value (or the result of the corresponding `_map_*` helper) is `None`.
        """
        result = PPKt.Biosample()
        result.id = row['specimen_id']

        # derived_from_subject -> individual_id
        subject = row['derived_from_subject']
        if subject is not None:
            result.individual_id = subject

        # TODO: Biosample time_of_collection (age at time the sample was collected).
        #  This needs the subject's age plus days_to_collection, i.e. the iso8601duration
        #  of the individual whose id equals the biosample's individual_id, shifted by
        #  pd.Timedelta(days=days_to_collection). The plan is to perform this in
        #  cda_table_importer.py under "Retrieve GA4GH Biospecimen messages".
        # The column is still read (number of days from index date to sample collection
        # date) so that a row lacking it fails here, but the value is not used yet.
        days_to_collection = row['days_to_collection']

        # derived_from_specimen -> derived_from_id
        #
        # Under mapping specimen it says (for GDC): 'specimen_type' is "'sample' or
        # 'portion' or 'slide' or 'analyte' or 'aliquot'" and 'derived_from_specimen' is
        # "'initial specimen' if specimen_type is 'sample'; otherwise Specimen.id for
        # parent Specimen record".
        #
        # Note: may want to add a check that specimen_type from CDA is 'sample' if
        # derived_from is 'initial specimen'.
        parent_specimen = row['derived_from_specimen']
        if parent_specimen is not None and parent_specimen != 'initial specimen':
            result.derived_from_id = parent_specimen

        # anatomical_site -> sampled_tissue
        tissue = _map_anatomical_site(row['anatomical_site'])
        if tissue is not None:
            result.sampled_tissue.CopyFrom(tissue)

        # specimen_type -> sample_type
        specimen_kind = _map_specimen_type(row['specimen_type'])
        if specimen_kind is not None:
            result.sample_type.CopyFrom(specimen_kind)

        # Taxonomy is always set from the HOMO_SAPIENS constant.
        result.taxonomy.CopyFrom(HOMO_SAPIENS)

        # primary_disease_type -> histological_diagnosis
        diagnosis = _map_primary_disease_type(row['primary_disease_type'])
        if diagnosis is not None:
            result.histological_diagnosis.CopyFrom(diagnosis)

        # source_material_type -> material_sample
        material = _map_source_material_type(row['source_material_type'])
        if material is not None:
            result.material_sample.CopyFrom(material)

        return result