Bases: PhenopacketValidator
Validate a list of phenopackets as to whether they have a minimum number of phenotypic features and alleles
The following example shows how to use this class to assess whether each phenopacket in the directory called "phenopackets" contains at least three HPO terms. To also check the number of variants and alleles, pass an allelic_requirement argument to the constructor.
from pyphetools.visualization import PhenopacketIngestor
from pyphetools.validation import ContentValidator
ingestor = PhenopacketIngestor(indir="phenopackets")
ppkt_d = ingestor.get_phenopacket_dictionary()
ppkt_list = list(ppkt_d.values())
validator = ContentValidator(min_hpo=3)
errors = validator.validate_phenopacket_list(ppkt_list)
print(f"{len(errors)} errors were identified")
Note that this class does not test for all errors. Use phenopacket-tools to check for redundant or conflicting
annotations.
Parameters:
Name |
Type |
Description |
Default |
min_hpo
|
int
|
minimum number of phenotypic features (HP terms) for this phenopacket to be considered valid
|
required
|
allelic_requirement
|
AllelicRequirement
|
used to check number of alleles and variants
|
None
|
Source code in pyphetools/validation/content_validator.py
class ContentValidator(PhenopacketValidator):
    """
    Validate phenopackets as to whether they have a minimum number of phenotypic features and diseases
    and, optionally, the number of variants and alleles expected for an allelic requirement.

    The following example shows how to use this class to assess whether each phenopacket in the directory
    called "phenopackets" contains at least three HPO terms. Pass an ``allelic_requirement`` to the
    constructor to additionally check the number of variants and alleles.

    ```python
    from pyphetools.visualization import PhenopacketIngestor
    from pyphetools.validation import ContentValidator
    ingestor = PhenopacketIngestor(indir="phenopackets")
    ppkt_d = ingestor.get_phenopacket_dictionary()
    ppkt_list = list(ppkt_d.values())
    validator = ContentValidator(min_hpo=3)
    errors = validator.validate_phenopacket_list(ppkt_list)
    print(f"{len(errors)} errors were identified")
    ```

    Note that this class does not test for all errors. Use phenopacket-tools to check for redundant or conflicting
    annotations.

    :param min_hpo: minimum number of phenotypic features (HP terms) for this phenopacket to be considered valid
    :type min_hpo: int
    :param allelic_requirement: used to check number of alleles and variants; no variant/allele check is done if None
    :type allelic_requirement: AllelicRequirement
    :param minimum_disease_count: minimum number of diseases for this phenopacket to be considered valid (default 1)
    :type minimum_disease_count: int
    """

    def __init__(self, min_hpo: int, allelic_requirement: AllelicRequirement = None, minimum_disease_count: int = 1) -> None:
        super().__init__()
        self._min_hpo = min_hpo
        self._allelic_requirement = allelic_requirement
        self._minimum_disease_count = minimum_disease_count

    @staticmethod
    def _count_alleles(variant_interpretation) -> int:
        """
        Return the number of alleles implied by the allelic state label of a variant interpretation.

        :param variant_interpretation: object with a ``variation_descriptor`` attribute
        :returns: 2 for "homozygous", 1 for "heterozygous" or "hemizygous", 0 otherwise
            (including a missing variation descriptor or allelic state)
        :rtype: int
        """
        vdesc = variant_interpretation.variation_descriptor
        if vdesc is None or vdesc.allelic_state is None:
            return 0
        label = vdesc.allelic_state.label
        if label == "homozygous":  # "GENO:0000136"
            return 2
        if label in ("heterozygous", "hemizygous"):  # "GENO:0000135", "GENO:0000134"
            return 1
        return 0

    def validate_individual(self, individual: Individual) -> List[ValidationResult]:
        """
        check a single Individual as to whether there are sufficient HPO terms, diseases, and alleles/variants

        :param individual: the individual to be checked
        :type individual: Individual
        :returns: a potentially empty list of validations
        :rtype: List[ValidationResult]
        """
        n_pf = len(individual.hpo_terms)
        pp_id = individual.get_phenopacket_id()
        n_var = 0
        n_alleles = 0
        # every entry of the interpretation list counts as one variant
        for variant_interpretation in individual.interpretation_list:
            n_var += 1
            n_alleles += self._count_alleles(variant_interpretation)
        disease_count = individual.disease_count()
        return self._validate(pp_id=pp_id, n_hpo=n_pf, disease_count=disease_count, n_var=n_var, n_alleles=n_alleles)

    def validate_phenopacket(self, phenopacket) -> List[ValidationResult]:
        """
        check a single phenopacket as to whether there are sufficient HPO terms, diseases, and alleles/variants

        :param phenopacket: path to a phenopacket JSON file or a GA4GH Phenopacket object
        :returns: a potentially empty list of validations
        :rtype: List[ValidationResult]
        :raises FileNotFoundError: if a path was passed that does not point to a file
        :raises ValueError: if the argument is neither a str nor a GA4GH Phenopacket
        """
        if isinstance(phenopacket, str):
            # the user passed a file
            if not os.path.isfile(phenopacket):
                raise FileNotFoundError(f"Could not find phenopacket file at '{phenopacket}'")
            with open(phenopacket, encoding="utf-8") as f:
                data = f.read()
            jsondata = json.loads(data)
            phpacket = Parse(json.dumps(jsondata), phenopackets.Phenopacket())
        elif isinstance(phenopacket, phenopackets.Phenopacket):
            phpacket = phenopacket
        else:
            raise ValueError("phenopacket argument must be file path or GA4GH Phenopacket "
                             f"object but was {type(phenopacket)}")
        pp_id = phpacket.id
        n_pf = len(phpacket.phenotypic_features)
        n_var = 0
        n_alleles = 0
        if phpacket.interpretations is not None:
            for interpretation in phpacket.interpretations:
                if interpretation.diagnosis is None:
                    continue
                # every genomic interpretation counts as one variant
                for genomic_interpretation in interpretation.diagnosis.genomic_interpretations:
                    n_var += 1
                    n_alleles += self._count_alleles(genomic_interpretation.variant_interpretation)
        # use the parsed message here: ``phenopacket`` may be a file path (str) without a ``diseases`` attribute
        disease_count = len(phpacket.diseases)
        return self._validate(pp_id=pp_id, n_hpo=n_pf, disease_count=disease_count, n_var=n_var, n_alleles=n_alleles)

    def _validate(self, pp_id: str, n_hpo: int, disease_count: int, n_var: int = None, n_alleles: int = None) -> List[ValidationResult]:
        """
        private method called by validate_individual or validate_phenopacket.

        :param pp_id: phenopacket identifier
        :type pp_id: str
        :param n_hpo: Number of HPO terms
        :type n_hpo: int
        :param disease_count: Number of diseases
        :type disease_count: int
        :param n_var: Number of variants found
        :type n_var: Optional[int]
        :param n_alleles: Number of alleles found
        :type n_alleles: Optional[int]
        :returns: a potentially empty list of validations
        :rtype: List[ValidationResult]
        """
        validation_results = []
        if n_hpo < self._min_hpo:
            validation_results.append(ValidationResultBuilder(phenopacket_id=pp_id).insufficient_hpos(min_hpo=self._min_hpo, n_hpo=n_hpo).build())
        if disease_count < self._minimum_disease_count:
            val_result = ValidationResultBuilder(phenopacket_id=pp_id).insufficient_disease_count(disease_count, self._minimum_disease_count).build()
            validation_results.append(val_result)
        if self._allelic_requirement is None:
            # no allelic requirement configured: skip the variant/allele checks
            return validation_results
        if self._allelic_requirement == AllelicRequirement.MONO_ALLELIC:
            # mono-allelic: exactly one variant and exactly one allele
            if n_var != 1:
                val_result = ValidationResultBuilder(phenopacket_id=pp_id).incorrect_variant_count(self._allelic_requirement, n_var).build()
                validation_results.append(val_result)
            if n_alleles != 1:
                val_result = ValidationResultBuilder(phenopacket_id=pp_id).incorrect_allele_count(self._allelic_requirement, n_alleles).build()
                validation_results.append(val_result)
        elif self._allelic_requirement == AllelicRequirement.BI_ALLELIC:
            # bi-allelic: one (homozygous) or two variants, and exactly two alleles
            if n_var < 1 or n_var > 2:
                val_result = ValidationResultBuilder(phenopacket_id=pp_id).incorrect_variant_count(self._allelic_requirement, n_var).build()
                validation_results.append(val_result)
            if n_alleles != 2:
                val_result = ValidationResultBuilder(phenopacket_id=pp_id).incorrect_allele_count(self._allelic_requirement, n_alleles).build()
                validation_results.append(val_result)
        return validation_results
|
validate_individual(individual)
check a single Individual as to whether there are sufficient HPO terms and alleles/variants
Returns:
Type |
Description |
List[ValidationResult]
|
a potentially empty list of validations
|
Source code in pyphetools/validation/content_validator.py
def validate_individual(self, individual:Individual) -> List[ValidationResult]:
    """
    Count the HPO terms, variants, alleles and diseases of a single Individual and validate these counts

    :param individual: the individual to be checked
    :type individual: Individual
    :returns: a potentially empty list of validations
    :rtype: List[ValidationResult]
    """
    hpo_total = len(individual.hpo_terms)
    packet_id = individual.get_phenopacket_id()
    variant_total = 0
    allele_total = 0
    for interpretation in individual.interpretation_list:
        # each interpretation counts as one variant, whether or not it has an allelic state
        variant_total += 1
        descriptor = interpretation.variation_descriptor
        if descriptor is None:
            continue
        state = descriptor.allelic_state
        if state is None:
            continue
        if state.label == "homozygous": # "GENO:0000136"
            allele_total += 2
        elif state.label in ("heterozygous", "hemizygous"): # "GENO:0000135", "GENO:0000134"
            allele_total += 1
    disease_total = individual.disease_count()
    return self._validate(
        pp_id=packet_id,
        n_hpo=hpo_total,
        disease_count=disease_total,
        n_var=variant_total,
        n_alleles=allele_total,
    )
|
validate_phenopacket(phenopacket)
check a single phenopacket as to whether there are sufficient HPO terms and alleles/variants
Returns:
Type |
Description |
List[ValidationResult]
|
a potentially empty list of validations
|
Source code in pyphetools/validation/content_validator.py
def validate_phenopacket(self, phenopacket) -> List[ValidationResult]:
    """
    check a single phenopacket as to whether there are sufficient HPO terms, diseases, and alleles/variants

    :param phenopacket: path to a phenopacket JSON file or a GA4GH Phenopacket object
    :returns: a potentially empty list of validations
    :rtype: List[ValidationResult]
    :raises FileNotFoundError: if a path was passed that does not point to a file
    :raises ValueError: if the argument is neither a str nor a GA4GH Phenopacket
    """
    if isinstance(phenopacket, str):
        # the user passed a file
        if not os.path.isfile(phenopacket):
            raise FileNotFoundError(f"Could not find phenopacket file at '{phenopacket}'")
        with open(phenopacket, encoding="utf-8") as f:
            data = f.read()
        jsondata = json.loads(data)
        phpacket = Parse(json.dumps(jsondata), phenopackets.Phenopacket())
    elif isinstance(phenopacket, phenopackets.Phenopacket):
        phpacket = phenopacket
    else:
        raise ValueError("phenopacket argument must be file path or GA4GH Phenopacket "
                         f"object but was {type(phenopacket)}")
    pp_id = phpacket.id
    n_pf = len(phpacket.phenotypic_features)
    n_var = 0
    n_alleles = 0
    if phpacket.interpretations is not None:
        for interpretation in phpacket.interpretations:
            if interpretation.diagnosis is None:
                continue
            dx = interpretation.diagnosis
            for genomic_interpretation in dx.genomic_interpretations:
                # every genomic interpretation counts as one variant
                n_var += 1
                vint = genomic_interpretation.variant_interpretation
                vdesc = vint.variation_descriptor
                if vdesc is None or vdesc.allelic_state is None:
                    continue
                label = vdesc.allelic_state.label
                if label == "homozygous": # "GENO:0000136"
                    n_alleles += 2
                elif label in ("heterozygous", "hemizygous"): # "GENO:0000135", "GENO:0000134"
                    n_alleles += 1
    # use the parsed message here: ``phenopacket`` may be a file path (str) without a ``diseases`` attribute
    disease_count = len(phpacket.diseases)
    return self._validate(pp_id=pp_id, n_hpo=n_pf, disease_count=disease_count, n_var=n_var, n_alleles=n_alleles)
|