Skip to content

Phenopacket utils

GeneIdentifierUpdater

Class for updating gene identifiers within genomic interpretations.

Source code in src/pheval/utils/phenopacket_utils.py
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
class GeneIdentifierUpdater:
    """Class for updating gene identifiers within genomic interpretations."""

    def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
        """
        Initialise the GeneIdentifierUpdater.

        Args:
            gene_identifier (str): The gene identifier to update to. It is used as the key
                read from each HGNC record.
            hgnc_data (dict): A dictionary containing HGNC data, keyed by gene symbol
                (default: None).
            identifier_map (dict): A dictionary mapping gene identifiers to gene symbols
                (default: None).
        """
        self.hgnc_data = hgnc_data
        self.gene_identifier = gene_identifier
        self.identifier_map = identifier_map

    def _find_hgnc_record(self, gene_symbol: str):
        """
        Locate the HGNC record for a current or previous gene symbol.

        Args:
            gene_symbol (str): The gene symbol to look up.

        Returns:
            tuple: ``(current_symbol, record)`` for the first matching record, or None
            when the symbol is neither a key of the HGNC data nor listed as a previous symbol.
        """
        if gene_symbol in self.hgnc_data:
            return gene_symbol, self.hgnc_data[gene_symbol]
        # Fall back to scanning the previous symbols of every record.
        for symbol, data in self.hgnc_data.items():
            if any(prev_symbol == gene_symbol for prev_symbol in data["previous_symbol"]):
                return symbol, data
        return None

    def find_identifier(self, gene_symbol: str) -> str:
        """
        Find the specified gene identifier for a gene symbol.

        Args:
            gene_symbol (str): The gene symbol to find the identifier for.

        Returns:
            str: The identified gene identifier, or None when the symbol is not found
            as a current or previous symbol.
        """
        match = self._find_hgnc_record(gene_symbol)
        if match is None:
            return None
        _symbol, data = match
        return data[self.gene_identifier]

    def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
        """
        Obtain gene symbol from a gene identifier.

        Args:
            query_gene_identifier (str): The gene identifier.

        Returns:
            str: The gene symbol corresponding to the identifier.

        Raises:
            KeyError: If the identifier is not present in the identifier map.
        """
        return self.identifier_map[query_gene_identifier]

    def _find_alternate_ids(self, gene_symbol: str) -> List[str]:
        """
        Find the alternate IDs for a gene symbol.

        Args:
            gene_symbol (str): The gene symbol to find alternate IDs for.

        Returns:
            List[str]: The HGNC ID followed by the prefixed Entrez, Ensembl and symbol IDs,
            or None when the symbol is not found as a current or previous symbol.
        """
        match = self._find_hgnc_record(gene_symbol)
        if match is None:
            return None
        symbol, data = match
        return [
            data["hgnc_id"],
            "ncbigene:" + data["entrez_id"],
            "ensembl:" + data["ensembl_id"],
            # Always report the current symbol, even when matched via a previous symbol.
            "symbol:" + symbol,
        ]

    def update_genomic_interpretations_gene_identifier(
        self, interpretations: List[Interpretation], phenopacket_path: Path
    ) -> List[Interpretation]:
        """
        Update the genomic interpretations of a Phenopacket.

        Args:
            interpretations (List[Interpretation]): List of Interpretation objects.
            phenopacket_path (Path): Path to the Phenopacket; only used in the log message.

        Returns:
            List[Interpretation]: Updated list of Interpretation objects.
        """
        # NOTE(review): only the list is copied here; assuming `copy` is `copy.copy`, the
        # Interpretation objects themselves are shared with the input and updated in place.
        updated_interpretations = copy(list(interpretations))
        for updated_interpretation in updated_interpretations:
            for g in updated_interpretation.diagnosis.genomic_interpretations:
                gene_context = g.variant_interpretation.variation_descriptor.gene_context
                updated_gene_identifier = self.find_identifier(gene_context.symbol)
                info_log.info(
                    f"Updating gene identifier in {phenopacket_path} from "
                    f"{gene_context.value_id} "
                    f"to {updated_gene_identifier}"
                )
                gene_context.value_id = updated_gene_identifier
                # Replace, rather than append to, the existing alternate IDs.
                del gene_context.alternate_ids[:]
                gene_context.alternate_ids.extend(self._find_alternate_ids(gene_context.symbol))
        return updated_interpretations

__init__(gene_identifier, hgnc_data=None, identifier_map=None)

Initialise the GeneIdentifierUpdater.

Parameters:

Name Type Description Default
gene_identifier str

The gene identifier to update to.

required
hgnc_data dict

A dictionary containing HGNC data (default: None).

None
identifier_map dict

A dictionary mapping gene identifiers (default: None).

None
Source code in src/pheval/utils/phenopacket_utils.py
639
640
641
642
643
644
645
646
647
648
649
650
651
def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
    """
    Store the target identifier type and the lookup tables on the instance.

    Args:
        gene_identifier (str): The gene identifier to update to.
        hgnc_data (dict): A dictionary containing HGNC data (default: None).
        identifier_map (dict): A dictionary mapping gene identifiers (default: None).
    """
    self.gene_identifier = gene_identifier
    self.hgnc_data, self.identifier_map = hgnc_data, identifier_map

find_identifier(gene_symbol)

Find the specified gene identifier for a gene symbol.

Parameters:

Name Type Description Default
gene_symbol str

The gene symbol to find the identifier for.

required

Returns:

Name Type Description
str str

The identified gene identifier.

Source code in src/pheval/utils/phenopacket_utils.py
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
def find_identifier(self, gene_symbol: str) -> str:
    """
    Look up the configured identifier for a gene symbol.

    The symbol is first tried as a key of the HGNC data; if it is not one, every
    record's previous symbols are scanned and the first match is used.

    Args:
        gene_symbol (str): The gene symbol to find the identifier for.

    Returns:
        str: The identified gene identifier (None when nothing matches).
    """
    records = self.hgnc_data
    if gene_symbol not in records.keys():
        # Not a current symbol: search the previous symbols of each record.
        for record in records.values():
            if any(previous == gene_symbol for previous in record["previous_symbol"]):
                return record[self.gene_identifier]
        return None
    return records[gene_symbol][self.gene_identifier]

obtain_gene_symbol_from_identifier(query_gene_identifier)

Obtain gene symbol from a gene identifier.

Parameters:

Name Type Description Default
query_gene_identifier str

The gene identifier.

required

Returns:

Name Type Description
str str

The gene symbol corresponding to the identifier.

Source code in src/pheval/utils/phenopacket_utils.py
671
672
673
674
675
676
677
678
679
680
681
def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
    """
    Translate a gene identifier into its gene symbol via the identifier map.

    Args:
        query_gene_identifier (str): The gene identifier.

    Returns:
        str: The gene symbol corresponding to the identifier.
    """
    gene_symbol = self.identifier_map[query_gene_identifier]
    return gene_symbol

update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)

Update the genomic interpretations of a Phenopacket.

Parameters:

Name Type Description Default
interpretations List[Interpretation]

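List of Interpretation objects.

required
phenopacket_path Path

Path to the Phenopacket being updated; used only in the log message.

required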

Returns:

Type Description
List[Interpretation]

List[Interpretation]: Updated list of Interpretation objects.

Source code in src/pheval/utils/phenopacket_utils.py
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
def update_genomic_interpretations_gene_identifier(
    self, interpretations: List[Interpretation], phenopacket_path: Path
) -> List[Interpretation]:
    """
    Update the genomic interpretations of a Phenopacket.

    For every genomic interpretation, the gene context's identifier is replaced with
    the one found for its gene symbol and its alternate IDs are rebuilt.

    Args:
        interpretations (List[Interpretation]): List of Interpretation objects.
        phenopacket_path (Path): Path to the Phenopacket; only used in the log message.

    Returns:
        List[Interpretation]: Updated list of Interpretation objects.
    """
    # NOTE(review): only the list is copied here; assuming `copy` is `copy.copy`, the
    # Interpretation objects themselves are shared with the input and updated in place.
    updated_interpretations = copy(list(interpretations))
    for updated_interpretation in updated_interpretations:
        for g in updated_interpretation.diagnosis.genomic_interpretations:
            gene_context = g.variant_interpretation.variation_descriptor.gene_context
            updated_gene_identifier = self.find_identifier(gene_context.symbol)
            info_log.info(
                f"Updating gene identifier in {phenopacket_path} from "
                f"{gene_context.value_id} "
                f"to {updated_gene_identifier}"
            )
            gene_context.value_id = updated_gene_identifier
            # Replace, rather than append to, the existing alternate IDs.
            del gene_context.alternate_ids[:]
            gene_context.alternate_ids.extend(self._find_alternate_ids(gene_context.symbol))
    return updated_interpretations

GenomicVariant dataclass

Represents a genomic variant.

Parameters:

Name Type Description Default
chrom str

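The chromosome of the variant. The recommended format is the numerical designations 1 to 22 for the autosomal chromosomes, X and Y for the sex chromosomes, and MT for the mitochondrial chromosome.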

required
pos int

Position of the variant following VCF convention.

required
ref str

Reference allele following VCF convention.

required
alt str

Alternate allele following VCF convention.

required
Source code in src/pheval/utils/phenopacket_utils.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
@dataclass
class GenomicVariant:
    """
    Represents a genomic variant.

    Args:
        chrom (str): The chromosome of the variant. The recommended format is the
            numerical designations 1 to 22 for the autosomal chromosomes, X and Y for
            the sex chromosomes, and MT for the mitochondrial chromosome.
        pos (int): Position of the variant following VCF convention.
        ref (str): Reference allele following VCF convention.
        alt (str): Alternate allele following VCF convention.
    """

    chrom: str
    pos: int
    ref: str
    alt: str

IncompatibleGenomeAssemblyError

Bases: Exception

Exception raised for incompatible genome assembly.

Source code in src/pheval/utils/phenopacket_utils.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class IncompatibleGenomeAssemblyError(Exception):
    """Exception raised for incompatible genome assembly."""

    def __init__(self, assembly, phenopacket, message="Incompatible Genome Assembly"):
        """
        Initialise IncompatibleGenomeAssemblyError.

        Args:
           assembly (str): Incompatible genome assembly encountered.
           phenopacket (Path): Path to the Phenopacket associated with the error.
           message (str, optional): Custom error message (default is "Incompatible Genome Assembly").
        """
        # Only the message is handed to the base Exception; the rest lives on the instance.
        super().__init__(message)
        self.message: str = message
        self.phenopacket: Path = phenopacket
        self.assembly: str = assembly

    def __str__(self):
        """Render the error as "<message> -> <assembly> in <phenopacket>"."""
        return "{} -> {} in {}".format(self.message, self.assembly, self.phenopacket)

__init__(assembly, phenopacket, message='Incompatible Genome Assembly')

Initialise IncompatibleGenomeAssemblyError.

Attributes:

Name Type Description
assembly str

Incompatible genome assembly encountered.

phenopacket Path

Path to the Phenopacket associated with the error.

message str

Custom error message (default is "Incompatible Genome Assembly").

Source code in src/pheval/utils/phenopacket_utils.py
30
31
32
33
34
35
36
37
38
39
40
41
42
def __init__(self, assembly, phenopacket, message="Incompatible Genome Assembly"):
    """
    Initialise IncompatibleGenomeAssemblyError.

    Args:
       assembly (str): Incompatible genome assembly encountered.
       phenopacket (Path): Path to the Phenopacket associated with the error.
       message (str, optional): Custom error message (default is "Incompatible Genome Assembly").
    """
    # Only the message is handed to the base class; the rest lives on the instance.
    super().__init__(message)
    self.message: str = message
    self.phenopacket: Path = phenopacket
    self.assembly: str = assembly

PhenopacketRebuilder

Class for rebuilding a Phenopacket

Source code in src/pheval/utils/phenopacket_utils.py
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
class PhenopacketRebuilder:
    """Class for rebuilding a Phenopacket"""

    def __init__(self, phenopacket: Union[Phenopacket, Family]):
        """Initialise PhenopacketRebuilder

        Args:
            phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object
        """
        self.phenopacket = phenopacket

    @staticmethod
    def _proband_or_self(phenopacket):
        """Return the proband when the object has one (a Family), otherwise the object itself."""
        return phenopacket.proband if hasattr(phenopacket, "proband") else phenopacket

    def update_interpretations(
        self, interpretations: List[Interpretation]
    ) -> Union[Phenopacket, Family]:
        """
        Add the updated interpretations to a Phenopacket or Family.

        Any existing interpretations are removed before the new ones are added.

        Args:
            interpretations (List[Interpretation]): The updated interpretations to be added.

        Returns:
            Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations.
        """
        phenopacket = copy(self.phenopacket)
        target = self._proband_or_self(phenopacket)
        del target.interpretations[:]
        target.interpretations.extend(interpretations)
        return phenopacket

    def add_randomised_hpo(
        self, randomised_hpo: List[PhenotypicFeature]
    ) -> Union[Phenopacket, Family]:
        """
        Add randomised phenotypic profiles to a Phenopacket or Family.

        Any existing phenotypic features are removed before the new ones are added.

        Args:
            randomised_hpo (List[PhenotypicFeature]): The randomised phenotypic profiles to be added.

        Returns:
            Union[Phenopacket, Family]: The Phenopacket or Family object with added randomised profiles.
        """
        phenopacket = copy(self.phenopacket)
        target = self._proband_or_self(phenopacket)
        del target.phenotypic_features[:]
        target.phenotypic_features.extend(randomised_hpo)
        return phenopacket

    def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Union[Phenopacket, Family]:
        """
        Add a spiked VCF path to a Phenopacket or Family.

        Existing files whose ``fileFormat`` attribute is ``"vcf"`` are dropped; all other
        files are kept and the spiked VCF is appended after them.

        Args:
            spiked_vcf_file_data (File): The VCF file data to be added.

        Returns:
            Union[Phenopacket, Family]: The Phenopacket or Family object with the added spiked VCF path.
        """
        phenopacket = copy(self.phenopacket)
        phenopacket_files = [
            file for file in phenopacket.files if file.file_attributes["fileFormat"] != "vcf"
        ]
        phenopacket_files.append(spiked_vcf_file_data)
        del phenopacket.files[:]
        phenopacket.files.extend(phenopacket_files)
        return phenopacket

__init__(phenopacket)

Initialise PhenopacketRebuilder

Attributes:

Name Type Description
phenopacket Union[Phenopacket, Family]

Phenopacket or Family object

Source code in src/pheval/utils/phenopacket_utils.py
538
539
540
541
542
543
544
def __init__(self, phenopacket: Union[Phenopacket, Family]):
    """Initialise PhenopacketRebuilder

    Args:
        phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object
    """
    self.phenopacket = phenopacket

add_randomised_hpo(randomised_hpo)

Add randomised phenotypic profiles to a Phenopacket or Family.

Parameters:

Name Type Description Default
randomised_hpo [PhenotypicFeature]

The randomised phenotypic profiles to be added.

required

Returns:

Type Description
Union[Phenopacket, Family]

The Phenopacket or Family object with added randomised profiles.

Source code in src/pheval/utils/phenopacket_utils.py
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
def add_randomised_hpo(
    self, randomised_hpo: List[PhenotypicFeature]
) -> Union[Phenopacket, Family]:
    """
    Add randomised phenotypic profiles to a Phenopacket or Family.

    Any existing phenotypic features are removed before the new ones are added.

    Args:
        randomised_hpo (List[PhenotypicFeature]): The randomised phenotypic profiles to be added.

    Returns:
        Union[Phenopacket, Family]: The Phenopacket or Family object with added randomised profiles.
    """
    phenopacket = copy(self.phenopacket)
    # An object with a proband (a Family) keeps its phenotypic features on the proband.
    target = phenopacket.proband if hasattr(phenopacket, "proband") else phenopacket
    del target.phenotypic_features[:]
    target.phenotypic_features.extend(randomised_hpo)
    return phenopacket

add_spiked_vcf_path(spiked_vcf_file_data)

Add a spiked VCF path to a Phenopacket or Family.

Parameters:

  • spiked_vcf_file_data (File): The VCF file data to be added.

Returns:

  • Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path.
Source code in src/pheval/utils/phenopacket_utils.py
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Union[Phenopacket, Family]:
    """
    Swap the VCF entries of a Phenopacket or Family for a spiked VCF.

    Files whose ``fileFormat`` attribute is ``"vcf"`` are dropped, every other file is
    kept in order, and the spiked VCF is appended last.

    Args:
        spiked_vcf_file_data (File): The VCF file data to be added.

    Returns:
        Union[Phenopacket, Family]: The Phenopacket or Family object with the added spiked VCF path.
    """
    rebuilt = copy(self.phenopacket)
    retained = []
    for existing in rebuilt.files:
        if existing.file_attributes["fileFormat"] != "vcf":
            retained.append(existing)
    retained.append(spiked_vcf_file_data)
    # Empty the file list, then refill it with the retained files plus the spiked VCF.
    del rebuilt.files[:]
    rebuilt.files.extend(retained)
    return rebuilt

update_interpretations(interpretations)

Add the updated interpretations to a Phenopacket or Family.

Parameters:

Name Type Description Default
interpretations List[Interpretation]

The updated interpretations to be added.

required

Returns:

Type Description
Union[Phenopacket, Family]

Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations.

Source code in src/pheval/utils/phenopacket_utils.py
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
def update_interpretations(
    self, interpretations: List[Interpretation]
) -> Union[Phenopacket, Family]:
    """
    Add the updated interpretations to a Phenopacket or Family.

    Any existing interpretations are removed before the new ones are added.

    Args:
        interpretations (List[Interpretation]): The updated interpretations to be added.

    Returns:
        Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations.
    """
    phenopacket = copy(self.phenopacket)
    # An object with a proband (a Family) keeps its interpretations on the proband.
    target = phenopacket.proband if hasattr(phenopacket, "proband") else phenopacket
    del target.interpretations[:]
    target.interpretations.extend(interpretations)
    return phenopacket

PhenopacketUtil

Class for retrieving data from a Phenopacket or Family object

Source code in src/pheval/utils/phenopacket_utils.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
class PhenopacketUtil:
    """Class for retrieving data from a Phenopacket or Family object"""

    def __init__(self, phenopacket_contents: Union[Phenopacket, Family]):
        """Initialise PhenopacketUtil

        Args:
            phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object
        """
        self.phenopacket_contents = phenopacket_contents

    def _proband_or_phenopacket(self):
        """Return the proband when the contents have one (a Family), otherwise the contents."""
        contents = self.phenopacket_contents
        return contents.proband if hasattr(contents, "proband") else contents

    def sample_id(self) -> str:
        """
        Retrieve the sample ID from a Phenopacket or proband of a Family

        Returns:
            str: Sample ID
        """
        return self._proband_or_phenopacket().subject.id

    def phenotypic_features(self) -> List[PhenotypicFeature]:
        """
        Retrieve a list of all HPO terms

        Returns:
            List[PhenotypicFeature]: List of HPO terms
        """
        return self._proband_or_phenopacket().phenotypic_features

    def observed_phenotypic_features(self) -> List[PhenotypicFeature]:
        """
        Retrieve a list of all observed HPO terms

        Returns:
            List[PhenotypicFeature]: List of observed HPO terms (those not marked as excluded)
        """
        return [p for p in self.phenotypic_features() if not p.excluded]

    def negated_phenotypic_features(self) -> List[PhenotypicFeature]:
        """
        Retrieve a list of all negated HPO terms

        Returns:
            List[PhenotypicFeature]: List of negated HPO terms (those marked as excluded)
        """
        return [p for p in self.phenotypic_features() if p.excluded]

    def diseases(self) -> List[Disease]:
        """
        Retrieve a list of Diseases associated with the proband

        Returns:
            List[Disease]: List of diseases
        """
        return self._proband_or_phenopacket().diseases

    def _diagnosis_from_interpretations(self) -> List[ProbandDisease]:
        """
        Retrieve a list of disease diagnoses associated with the proband from the interpretations object

        Diagnoses with an empty disease label or an empty disease ID are skipped.

        Returns:
            List[ProbandDisease]: List of diagnosed diseases
        """
        diagnoses = []
        for i in self.interpretations():
            disease = i.diagnosis.disease
            if disease.label != "" and disease.id != "":
                diagnoses.append(
                    ProbandDisease(disease_name=disease.label, disease_identifier=disease.id)
                )
        return diagnoses

    def _diagnosis_from_disease(self) -> List[ProbandDisease]:
        """
        Retrieve a list of disease diagnoses associated with the proband from the diseases object

        Returns:
            List[ProbandDisease]: List of diagnosed diseases
        """
        return [
            ProbandDisease(disease_name=disease.term.label, disease_identifier=disease.term.id)
            for disease in self.diseases()
        ]

    def diagnoses(self) -> List[ProbandDisease]:
        """
        Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket

        Returns:
            List[ProbandDisease]: List of diagnosed diseases. Duplicates are removed through a
            set, so the order of the result is not defined.
        """
        return list(set(self._diagnosis_from_interpretations() + self._diagnosis_from_disease()))

    def interpretations(self) -> List[Interpretation]:
        """
        Retrieve a list of interpretations from a Phenopacket

        Returns:
            List[Interpretation]: List of interpretations
        """
        return self._proband_or_phenopacket().interpretations

    def causative_variants(self) -> List[ProbandCausativeVariant]:
        """
        Retrieve a list of causative variants listed in a Phenopacket

        Returns:
            List[ProbandCausativeVariant]: List of proband causative variants
        """
        all_variants = []
        for i in self.interpretations():
            for g in i.diagnosis.genomic_interpretations:
                vcf_record = g.variant_interpretation.variation_descriptor.vcf_record
                genotype = g.variant_interpretation.variation_descriptor.allelic_state
                variant_data = ProbandCausativeVariant(
                    # Go through sample_id() so the proband's subject is used for a Family.
                    self.sample_id(),
                    vcf_record.genome_assembly,
                    GenomicVariant(
                        vcf_record.chrom,
                        vcf_record.pos,
                        vcf_record.ref,
                        vcf_record.alt,
                    ),
                    genotype.label,
                    vcf_record.info,
                )
                all_variants.append(variant_data)
        return all_variants

    def files(self) -> List[File]:
        """
        Retrieve a list of files associated with a phenopacket

        Returns:
            List[File]: List of files associated with a phenopacket
        """
        return self.phenopacket_contents.files

    def vcf_file_data(self, phenopacket_path: Path, vcf_dir: Path) -> File:
        """
        Retrieve the genome assembly and VCF file name from a phenopacket.

        Args:
            phenopacket_path (Path): The path to the phenopacket file.
            vcf_dir (Path): The directory path where the VCF file is stored.

        Returns:
            File: The VCF file with updated URI pointing to the specified directory.

        Raises:
            IndexError: If no file with the "vcf" file format is listed.
            IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format.
            IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible.

        Note:
            This function takes the first file whose format is "vcf", validates its file name,
            and checks if the genome assembly is compatible. If the conditions are met, it updates the
            URI of that file object in place to the specified directory and returns it.
        """
        compatible_genome_assembly = ["GRCh37", "hg19", "GRCh38", "hg38"]
        vcf_data = [file for file in self.files() if file.file_attributes["fileFormat"] == "vcf"][0]
        vcf_name = Path(vcf_data.uri).name
        if not vcf_name.endswith((".vcf", ".vcf.gz")):
            raise IncorrectFileFormatError(Path(vcf_data.uri), ".vcf or .vcf.gz file")
        genome_assembly = vcf_data.file_attributes["genomeAssembly"]
        if genome_assembly not in compatible_genome_assembly:
            raise IncompatibleGenomeAssemblyError(genome_assembly, phenopacket_path)
        vcf_data.uri = str(vcf_dir.joinpath(vcf_name))
        return vcf_data

    @staticmethod
    def _extract_diagnosed_gene(
        genomic_interpretation: GenomicInterpretation,
    ) -> ProbandCausativeGene:
        """
        Retrieve the disease causing genes from the variant descriptor field if not empty,
        otherwise, retrieves from the gene descriptor from a phenopacket.

        Args:
            genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket

        Returns:
            ProbandCausativeGene: The disease causing gene
        """
        # A non-zero ByteSize() is used as the test for a populated variant interpretation.
        if genomic_interpretation.variant_interpretation.ByteSize() != 0:
            gene_context = genomic_interpretation.variant_interpretation.variation_descriptor.gene_context
            return ProbandCausativeGene(gene_context.symbol, gene_context.value_id)
        return ProbandCausativeGene(
            gene_symbol=genomic_interpretation.gene.symbol,
            gene_identifier=genomic_interpretation.gene.value_id,
        )

    def diagnosed_genes(self) -> List[ProbandCausativeGene]:
        """
        Retrieve the disease causing genes from a phenopacket.

        Genes are de-duplicated by gene symbol: a symbol keeps the position of its first
        occurrence and the record of its last occurrence.

        Returns:
            List[ProbandCausativeGene]: List of causative genes
        """
        unique_genes = {}
        for i in self.interpretations():
            for g in i.diagnosis.genomic_interpretations:
                gene = self._extract_diagnosed_gene(g)
                unique_genes[gene.gene_symbol] = gene
        return list(unique_genes.values())

    def diagnosed_variants(self) -> List[GenomicVariant]:
        """
        Retrieve a list of all known causative variants from a phenopacket.

        Any "chr" text is removed from the chromosome value.

        Returns:
            List[GenomicVariant]: List of causative variants
        """
        variants = []
        for i in self.interpretations():
            for g in i.diagnosis.genomic_interpretations:
                vcf_record = g.variant_interpretation.variation_descriptor.vcf_record
                variants.append(
                    GenomicVariant(
                        chrom=vcf_record.chrom.replace("chr", ""),
                        pos=vcf_record.pos,
                        ref=vcf_record.ref,
                        alt=vcf_record.alt,
                    )
                )
        return variants

    def check_incomplete_variant_record(self) -> bool:
        """
        Check if any variant record in the phenopacket has incomplete information.

        This method iterates through the diagnosed variant records and checks if any of them
        have missing or incomplete information such as empty chromosome, position, reference,
        or alternate allele.

        Returns:
            bool: True if any variant record is incomplete, False otherwise.
        """
        return any(
            variant.chrom == ""
            or variant.pos == 0
            or variant.pos == ""
            or variant.ref == ""
            or variant.alt == ""
            for variant in self.diagnosed_variants()
        )

    def check_incomplete_gene_record(self) -> bool:
        """
        Check if any gene record in the phenopacket has incomplete information.

        This method iterates through the diagnosed gene records and checks if any of them
        have missing or incomplete information such as gene name, or gene identifier.

        Returns:
            bool: True if any gene record is incomplete, False otherwise.
        """
        return any(
            gene.gene_symbol == "" or gene.gene_identifier == ""
            for gene in self.diagnosed_genes()
        )

    def check_incomplete_disease_record(self) -> bool:
        """
        Check if the phenopacket is missing disease information.

        Diagnoses taken from interpretations are only collected when both the disease name
        and the disease identifier are non-empty, so this reports True when no diagnosis
        at all is available.

        Returns:
            bool: True if no disease diagnosis is recorded, False otherwise.
        """
        return not self.diagnoses()

__init__(phenopacket_contents)

Initialise PhenopacketUtil

Parameters:

Name Type Description Default
phenopacket_contents Union[Phenopacket, Family]

Phenopacket or Family object

required
Source code in src/pheval/utils/phenopacket_utils.py
222
223
224
225
226
227
228
def __init__(self, phenopacket_contents: Union[Phenopacket, Family]):
    """Initialise PhenopacketUtil

    Args:
        phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object
            that the accessor methods of this class read from
    """
    # Stored as given; no copy or validation happens here.
    self.phenopacket_contents = phenopacket_contents

causative_variants()

Retrieve a list of causative variants listed in a Phenopacket

Returns:

Type Description
List[ProbandCausativeVariant]

List[ProbandCausativeVariant]: List of proband causative variants

Source code in src/pheval/utils/phenopacket_utils.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
def causative_variants(self) -> List[ProbandCausativeVariant]:
    """
    Retrieve a list of causative variants listed in a Phenopacket or the proband of a Family

    Returns:
        List[ProbandCausativeVariant]: List of proband causative variants, one per
            genomic interpretation of every interpretation
    """
    # Resolve the proband ID through sample_id() so a Family, whose subject is nested
    # under `proband`, is handled the same way interpretations() handles it.
    proband_id = self.sample_id()
    all_variants = []
    for interpretation in self.interpretations():
        for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
            descriptor = genomic_interpretation.variant_interpretation.variation_descriptor
            vcf_record = descriptor.vcf_record
            all_variants.append(
                ProbandCausativeVariant(
                    proband_id,
                    vcf_record.genome_assembly,
                    GenomicVariant(
                        vcf_record.chrom,
                        vcf_record.pos,
                        vcf_record.ref,
                        vcf_record.alt,
                    ),
                    # The genotype is taken from the label of the allelic state.
                    descriptor.allelic_state.label,
                    vcf_record.info,
                )
            )
    return all_variants

check_incomplete_disease_record()

Check if any disease record in the phenopacket has incomplete information.

This method retrieves the disease diagnoses and reports the record as incomplete when none are found; individual disease names and identifiers are not inspected.

Returns:

Name Type Description
bool bool

True if any disease record is incomplete, False otherwise.

Source code in src/pheval/utils/phenopacket_utils.py
520
521
522
523
524
525
526
527
528
529
530
531
532
def check_incomplete_disease_record(self) -> bool:
    """
    Report whether the phenopacket lacks a disease record.

    The check is purely on presence: the result is True when diagnoses() yields no
    entries. Individual disease names and identifiers are not inspected.

    Returns:
        bool: True if no diagnoses are recorded, False otherwise.
    """
    return len(self.diagnoses()) == 0

check_incomplete_gene_record()

Check if any gene record in the phenopacket has incomplete information.

This method iterates through the diagnosed gene records and checks whether any of them is incomplete, i.e. has an empty gene symbol or an empty gene identifier.

Returns:

Name Type Description
bool bool

True if any gene record is incomplete, False otherwise.

Source code in src/pheval/utils/phenopacket_utils.py
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
def check_incomplete_gene_record(self) -> bool:
    """
    Report whether any diagnosed gene record is missing information.

    A gene record counts as incomplete when its gene symbol or its gene identifier
    is an empty string.

    Returns:
        bool: True if at least one diagnosed gene is incomplete, False otherwise.
    """
    return any(
        record.gene_symbol == "" or record.gene_identifier == ""
        for record in self.diagnosed_genes()
    )

check_incomplete_variant_record()

Check if any variant record in the phenopacket has incomplete information.

This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele.

Returns:

Name Type Description
bool bool

True if any variant record is incomplete, False otherwise.

Source code in src/pheval/utils/phenopacket_utils.py
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
def check_incomplete_variant_record(self) -> bool:
    """
    Report whether any diagnosed variant record is missing information.

    A variant counts as incomplete when its chromosome, reference allele or alternate
    allele is an empty string, or when its position is 0 or an empty string.

    Returns:
        bool: True if at least one diagnosed variant is incomplete, False otherwise.
    """
    return any(
        record.chrom == ""
        or record.pos == 0
        or record.pos == ""
        or record.ref == ""
        or record.alt == ""
        for record in self.diagnosed_variants()
    )

diagnosed_genes()

Retrieve the disease causing genes from a phenopacket.

Returns:

Type Description
List[ProbandCausativeGene]

List[ProbandCausativeGene]: List of causative genes

Source code in src/pheval/utils/phenopacket_utils.py
446
447
448
449
450
451
452
453
454
455
456
457
458
def diagnosed_genes(self) -> List[ProbandCausativeGene]:
    """
    Retrieve the disease causing genes from a phenopacket.

    Genes are de-duplicated by gene symbol. When a symbol occurs more than once, the
    entry keeps the position of its first occurrence and the value of its last one.

    Returns:
        List[ProbandCausativeGene]: List of causative genes, unique by gene symbol
    """
    # A dict keyed on the symbol de-duplicates in a single pass; re-assigning an
    # existing key keeps its original position, so ordering matches the former
    # "rebuild the list after every append" approach without the quadratic cost.
    genes_by_symbol = {}
    for interpretation in self.interpretations():
        for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
            gene = self._extract_diagnosed_gene(genomic_interpretation)
            genes_by_symbol[gene.gene_symbol] = gene
    return list(genes_by_symbol.values())

diagnosed_variants()

Retrieve a list of all known causative variants from a phenopacket.

Returns:

Type Description
List[GenomicVariant]

List[GenomicVariant]: List of causative variants

Source code in src/pheval/utils/phenopacket_utils.py
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
def diagnosed_variants(self) -> List[GenomicVariant]:
    """
    Collect every known causative variant recorded in a phenopacket.

    Each genomic interpretation of each interpretation contributes one variant, built
    from its VCF record. Every occurrence of "chr" is removed from the chromosome name.

    Returns:
        List[GenomicVariant]: List of causative variants
    """
    causative = []
    for interpretation in self.interpretations():
        for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
            descriptor = genomic_interpretation.variant_interpretation.variation_descriptor
            vcf_record = descriptor.vcf_record
            causative.append(
                GenomicVariant(
                    chrom=vcf_record.chrom.replace("chr", ""),
                    pos=vcf_record.pos,
                    ref=vcf_record.ref,
                    alt=vcf_record.alt,
                )
            )
    return causative

diagnoses()

Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket

Returns:

Type Description
List[ProbandDisease]

List[ProbandDisease]: List of diagnosed diseases

Source code in src/pheval/utils/phenopacket_utils.py
331
332
333
334
335
336
337
338
def diagnoses(self) -> List[ProbandDisease]:
    """
    Collect the distinct disease diagnoses recorded for the proband in a Phenopacket

    Diagnoses found via the interpretations and via the disease entries are combined
    and de-duplicated through a set, so the order of the result is not defined.

    Returns:
        List[ProbandDisease]: List of diagnosed diseases
    """
    from_interpretations = self._diagnosis_from_interpretations()
    from_diseases = self._diagnosis_from_disease()
    unique_diagnoses = set(from_interpretations + from_diseases)
    return list(unique_diagnoses)

diseases()

Retrieve a list of Diseases associated with the proband

Returns:

Type Description
List[Disease]

List[Disease]: List of diseases

Source code in src/pheval/utils/phenopacket_utils.py
283
284
285
286
287
288
289
290
291
292
293
def diseases(self) -> List[Disease]:
    """
    Return the diseases recorded for the proband

    When the stored contents expose a `proband` attribute, the diseases are read from
    that proband; otherwise they are read from the contents directly.

    Returns:
        List[Disease]: List of diseases
    """
    contents = self.phenopacket_contents
    holder = contents.proband if hasattr(contents, "proband") else contents
    return holder.diseases

files()

Retrieve a list of files associated with a phenopacket

Returns:

Type Description
List[File]

List[File]: List of files associated with a phenopacket

Source code in src/pheval/utils/phenopacket_utils.py
380
381
382
383
384
385
386
387
def files(self) -> List[File]:
    """
    Return the files attached to the stored phenopacket contents

    Returns:
        List[File]: List of files associated with a phenopacket
    """
    contents = self.phenopacket_contents
    return contents.files

interpretations()

Retrieve a list of interpretations from a Phenopacket

Returns:

Type Description
List[Interpretation]

List[Interpretation]: List of interpretations

Source code in src/pheval/utils/phenopacket_utils.py
340
341
342
343
344
345
346
347
348
349
350
def interpretations(self) -> List[Interpretation]:
    """
    Return the interpretations recorded in a Phenopacket

    When the stored contents expose a `proband` attribute, the interpretations are read
    from that proband; otherwise they are read from the contents directly.

    Returns:
        List[Interpretation]: List of interpretations
    """
    contents = self.phenopacket_contents
    holder = contents.proband if hasattr(contents, "proband") else contents
    return holder.interpretations

negated_phenotypic_features()

Retrieve a list of all negated HPO terms

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: List of negated HPO terms

Source code in src/pheval/utils/phenopacket_utils.py
269
270
271
272
273
274
275
276
277
278
279
280
281
def negated_phenotypic_features(self) -> List[PhenotypicFeature]:
    """
    Return the phenotypic features that are flagged as excluded

    Returns:
        List[PhenotypicFeature]: List of negated HPO terms, in their original order
    """
    return [feature for feature in self.phenotypic_features() if feature.excluded]

observed_phenotypic_features()

Retrieve a list of all observed HPO terms

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: List of observed HPO terms

Source code in src/pheval/utils/phenopacket_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def observed_phenotypic_features(self) -> List[PhenotypicFeature]:
    """
    Return the phenotypic features that are not flagged as excluded

    Returns:
        List[PhenotypicFeature]: List of observed HPO terms, in their original order
    """
    return [feature for feature in self.phenotypic_features() if not feature.excluded]

phenotypic_features()

Retrieve a list of all HPO terms

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: List of HPO terms

Source code in src/pheval/utils/phenopacket_utils.py
242
243
244
245
246
247
248
249
250
251
252
def phenotypic_features(self) -> List[PhenotypicFeature]:
    """
    Return every phenotypic feature (HPO term) recorded for the proband

    When the stored contents expose a `proband` attribute, the features are read from
    that proband; otherwise they are read from the contents directly.

    Returns:
        List[PhenotypicFeature]: List of HPO terms
    """
    contents = self.phenopacket_contents
    holder = contents.proband if hasattr(contents, "proband") else contents
    return holder.phenotypic_features

sample_id()

Retrieve the sample ID from a Phenopacket or proband of a Family

Returns:

Name Type Description
str str

Sample ID

Source code in src/pheval/utils/phenopacket_utils.py
230
231
232
233
234
235
236
237
238
239
240
def sample_id(self) -> str:
    """
    Return the subject ID of a Phenopacket, or of the proband of a Family

    When the stored contents expose a `proband` attribute, the ID is read from that
    proband's subject; otherwise it is read from the subject of the contents directly.

    Returns:
        str: Sample ID
    """
    contents = self.phenopacket_contents
    holder = contents.proband if hasattr(contents, "proband") else contents
    return holder.subject.id

vcf_file_data(phenopacket_path, vcf_dir)

Retrieve the VCF file entry from a phenopacket, validate its file format and genome assembly, and point its URI at the given VCF directory.

Parameters:

Name Type Description Default
phenopacket_path Path

The path to the phenopacket file.

required
vcf_dir Path

The directory path where the VCF file is stored.

required

Returns:

Name Type Description
File File

The VCF file with updated URI pointing to the specified directory.

Raises:

Type Description
IncorrectFileFormatError

If the provided file is not in .vcf or .vcf.gz format.

IncompatibleGenomeAssemblyError

If the genome assembly of the VCF file is not compatible.

Note

This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object.

Source code in src/pheval/utils/phenopacket_utils.py
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def vcf_file_data(self, phenopacket_path: Path, vcf_dir: Path) -> File:
    """
    Locate the VCF file entry of a phenopacket and re-point its URI at a VCF directory.

    Args:
        phenopacket_path (Path): The path to the phenopacket file; only used when
            reporting an incompatible genome assembly.
        vcf_dir (Path): The directory path where the VCF file is stored.

    Returns:
        File: The VCF file with updated URI pointing to the specified directory.

    Raises:
        IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format.
        IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible.
        IndexError: If none of the listed files has the file format "vcf".

    Note:
        The first file whose "fileFormat" attribute is "vcf" is used. Its URI is
        overwritten in place with the original file name joined onto `vcf_dir`, and
        that same file object is returned.
    """
    supported_assemblies = ["GRCh37", "hg19", "GRCh38", "hg38"]
    vcf_entries = [entry for entry in self.files() if entry.file_attributes["fileFormat"] == "vcf"]
    vcf_data = vcf_entries[0]
    vcf_file_name = Path(vcf_data.uri).name
    if not vcf_file_name.endswith((".vcf", ".vcf.gz")):
        raise IncorrectFileFormatError(Path(vcf_data.uri), ".vcf or .vcf.gz file")
    genome_assembly = vcf_data.file_attributes["genomeAssembly"]
    if genome_assembly not in supported_assemblies:
        raise IncompatibleGenomeAssemblyError(genome_assembly, phenopacket_path)
    vcf_data.uri = str(vcf_dir.joinpath(vcf_file_name))
    return vcf_data

ProbandCausativeGene dataclass

Represents a causative gene associated with a proband

Parameters:

Name Type Description Default
gene_symbol str

Symbol representing the gene

required
gene_identifier str

The ENSEMBL gene identifier for the result entry

required
Notes

While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis.

Source code in src/pheval/utils/phenopacket_utils.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@dataclass
class ProbandCausativeGene:
    """
    Represents a causative gene associated with a proband

    Args:
        gene_symbol (str): Symbol representing the gene
        gene_identifier (str): The ENSEMBL gene identifier for the result entry

    Notes:
        While we recommend providing the gene identifier in the ENSEMBL namespace,
        any matching format used in Phenopacket interpretations and result output is acceptable
        for result matching purposes in the analysis.
    """

    # Gene symbol.
    gene_symbol: str
    # Gene identifier; ENSEMBL namespace recommended but not enforced here.
    gene_identifier: str

ProbandCausativeVariant dataclass

Represents a causative variant associated with a proband

Parameters:

Name Type Description Default
proband_id str

ID of the proband

required
assembly str

Genome assembly

required
variant GenomicVariant

Genomic variant associated with the proband

required
genotype str

Genotype information for the variant

required
info str

Additional information about the variant (default is an empty string)

''
Source code in src/pheval/utils/phenopacket_utils.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
@dataclass
class ProbandCausativeVariant:
    """
    Represents a causative variant associated with a proband

    Args:
        proband_id (str): ID of the proband
        assembly (str): Genome assembly
        variant (GenomicVariant): Genomic variant associated with the proband
        genotype (str): Genotype information for the variant
        info (str, optional): Additional information about the variant (default is an empty string)
    """

    # ID of the proband the variant belongs to.
    proband_id: str
    # Genome assembly the variant is described against.
    assembly: str
    # The variant itself.
    variant: GenomicVariant
    # Genotype information for the variant.
    genotype: str
    # Optional additional information; defaults to an empty string.
    info: str = ""

ProbandDisease dataclass

Represents a disease associated with a proband

Parameters:

Name Type Description Default
disease_name str

Name of the disease

required
disease_identifier str

Identifier for the disease result entry in the OMIM namespace

required
Notes

While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis.

Source code in src/pheval/utils/phenopacket_utils.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
@dataclass(frozen=True, eq=True)
class ProbandDisease:
    """
    Represents a disease associated with a proband

    Instances are frozen and compare by value, which makes them hashable.

    Args:
        disease_name (str): Name of the disease
        disease_identifier (str): Identifier for the disease result entry in the OMIM namespace

    Notes:
        While we recommend providing the disease identifier in the OMIM namespace,
        any matching format used in Phenopacket interpretations and result output is acceptable
        for result matching purposes in the analysis.
    """

    # Name of the disease.
    disease_name: str
    # Disease identifier; OMIM namespace recommended but not enforced here.
    disease_identifier: str

create_gene_identifier_map()

Create a mapping of gene identifiers to gene symbols using HGNC data.

Returns:

Name Type Description
dict dict

A mapping of gene identifiers to gene symbols.

Notes

The dictionary structure: { 'identifier': 'gene_symbol', ... }

Source code in src/pheval/utils/phenopacket_utils.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def create_gene_identifier_map() -> dict:
    """
    Create a mapping of gene identifiers to gene symbols using HGNC data.

    The Ensembl gene ID, HGNC ID, Entrez ID and RefSeq accession of every HGNC row are
    each mapped to that row's gene symbol. Identifiers that are missing for a row are
    skipped rather than inserted as NaN keys.

    Returns:
        dict: A mapping of gene identifiers to gene symbols.

    Notes:
        The dictionary structure:
        {
            'identifier': 'gene_symbol',
            ...
        }
    """
    identifier_columns = ("ensembl_gene_id", "hgnc_id", "entrez_id", "refseq_accession")
    hgnc_df = read_hgnc_data()
    identifier_map = {}
    for _index, row in hgnc_df.iterrows():
        symbol = row["symbol"]
        for column in identifier_columns:
            identifier = row[column]
            # Empty cells are read as NaN; using them as keys would pollute the map
            # with entries that point at an arbitrary gene symbol.
            if pd.notna(identifier):
                identifier_map[identifier] = symbol
    return identifier_map

create_hgnc_dict()

Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data.

Returns:

Name Type Description
defaultdict defaultdict

A dictionary containing gene symbols as keys and their associated gene information.

Notes

The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... }

Source code in src/pheval/utils/phenopacket_utils.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def create_hgnc_dict() -> defaultdict:
    """
    Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data.

    Returns:
        defaultdict: A dictionary containing gene symbols as keys and their associated gene information.

    Notes:
        The dictionary structure:
        {
            'gene_symbol': {
                'ensembl_id': str,
                'hgnc_id': str,
                'entrez_id': str,
                'refseq_accession': str,
                'previous_symbol': [str, ...]
            },
            ...
        }
        'previous_symbol' is an empty list when the HGNC row has no previous symbols.
    """
    hgnc_df = read_hgnc_data()
    hgnc_data = defaultdict(dict)
    for _index, row in hgnc_df.iterrows():
        gene_entry = hgnc_data[row["symbol"]]
        gene_entry["ensembl_id"] = row["ensembl_gene_id"]
        gene_entry["hgnc_id"] = row["hgnc_id"]
        gene_entry["entrez_id"] = row["entrez_id"]
        gene_entry["refseq_accession"] = row["refseq_accession"]
        previous = row["prev_symbol"]
        if pd.isna(previous):
            # An empty cell is read as NaN; stringifying it would register the bogus
            # previous symbol "nan" for the gene.
            gene_entry["previous_symbol"] = []
        else:
            # Previous symbols are "|"-separated and may be wrapped in double quotes.
            gene_entry["previous_symbol"] = [
                name.strip('"') for name in str(previous).split("|")
            ]

    return hgnc_data

create_json_message(phenopacket)

Create a JSON message for writing to a file.

Parameters:

  • phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON.

Returns:

  • str: A JSON-formatted string representation of the Phenopacket or Family object.
Source code in src/pheval/utils/phenopacket_utils.py
606
607
608
609
610
611
612
613
614
615
616
def create_json_message(phenopacket: Union[Phenopacket, Family]) -> str:
    """
    Serialise a Phenopacket or Family object to a JSON string ready to be written to a file.

    Args:
        phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON.

    Returns:
        str: A JSON-formatted string representation of the Phenopacket or Family object.
    """
    json_message = MessageToJson(phenopacket)
    return json_message

phenopacket_reader(file)

Read a Phenopacket file and returns its contents as a Phenopacket or Family object

Parameters:

Name Type Description Default
file Path

Path to the Phenopacket file

required

Returns:

Type Description
Union[Phenopacket, Family]

Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object

Source code in src/pheval/utils/phenopacket_utils.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
    """
    Read a Phenopacket file and returns its contents as a Phenopacket or Family object

    Args:
        file (Path): Path to the Phenopacket file

    Returns:
        Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object
    """
    # The context manager closes the handle even when the JSON cannot be parsed;
    # the encoding is pinned so reading does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as phenopacket_file:
        phenopacket = json.load(phenopacket_file)
    # A top-level "proband" key is treated as the marker of a Family message.
    message = Family() if "proband" in phenopacket else Phenopacket()
    return Parse(json.dumps(phenopacket), message)

read_hgnc_data()

Read HGNC data from a file and return it as a Pandas DataFrame.

Returns:

Type Description
pd.DataFrame

pd.DataFrame: DataFrame containing the HGNC data.

Source code in src/pheval/utils/phenopacket_utils.py
125
126
127
128
129
130
131
132
133
134
135
136
def read_hgnc_data() -> pd.DataFrame:
    """
    Read HGNC data from a file and return it as a Pandas DataFrame.

    The tab-separated file `resources/hgnc_complete_set.txt` is resolved relative to the
    parent directory of the directory containing this module. All columns are read as
    strings.

    Returns:
        pd.DataFrame: DataFrame containing the HGNC data.
    """
    # Walk up one directory explicitly instead of string-replacing "utils", which
    # would also rewrite any other "utils" occurring earlier in the install path.
    package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    hgnc_file = os.path.join(package_dir, "resources", "hgnc_complete_set.txt")
    return pd.read_csv(hgnc_file, delimiter="\t", dtype=str)

write_phenopacket(phenopacket, output_file)

Write a Phenopacket or Family object to a file in JSON format.

Parameters:

Name Type Description Default
phenopacket Phenopacket or Family

The Phenopacket or Family object to be written.

required
output_file Path

The Path object representing the file to write the Phenopacket data.

required

Returns:

Type Description
None

None

Source code in src/pheval/utils/phenopacket_utils.py
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path) -> None:
    """
    Write a Phenopacket or Family object to a file in JSON format.

    Args:
        phenopacket (Phenopacket or Family): The Phenopacket or Family object to be written.
        output_file (Path): The Path object representing the file to write the Phenopacket data.

    Returns:
        None
    """
    phenopacket_json = create_json_message(phenopacket)
    # The context manager closes the file on exit, so no explicit close() is needed.
    with open(output_file, "w") as outfile:
        outfile.write(phenopacket_json)