Skip to content

Create noisy phenopackets

HpoRandomiser

Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO).

Source code in src/pheval/prepare/create_noisy_phenopackets.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class HpoRandomiser:
    """Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO)."""

    def __init__(self, hpo_ontology: ProntoImplementation, scramble_factor: float):
        """
        Initialise the HpoRandomiser.

        Args:
            hpo_ontology (ProntoImplementation): The instance of the HPO ontology.
            scramble_factor (float): A factor for scrambling phenotypic features; it is multiplied
                by the number of features to decide how many are scrambled.
        """
        self.hpo_ontology = hpo_ontology
        self.phenotypic_abnormalities = set(hpo_ontology.roots(predicates=["HP:0000118"]))
        self.scramble_factor = scramble_factor

    def scramble_factor_proportions(self, phenotypic_features: List[PhenotypicFeature]) -> int:
        """
        Calculate the number of HPO terms to scramble based on the scramble factor.

        Args:
            phenotypic_features (List[PhenotypicFeature]): List of phenotypic features.

        Returns:
            int: The calculated number of phenotypic features to be scrambled. A profile with a
            single feature always yields 1.
        """
        if len(phenotypic_features) == 1:
            return 1
        return int(round(len(phenotypic_features) * self.scramble_factor, 0))

    def retrieve_hpo_term(self, hpo_id: str) -> PhenotypicFeature:
        """
        Retrieve an HPO term based on the provided HPO ID.

        Args:
            hpo_id (str): The HPO ID of the term to retrieve.

        Returns:
            PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term.
        """
        rels = self.hpo_ontology.entity_alias_map(hpo_id)
        # The label is built from the values stored under the first key of the alias map.
        first_key = list(rels.keys())[0]
        hpo_term = "".join(rels[first_key])
        return PhenotypicFeature(type=OntologyClass(id=hpo_id, label=hpo_term))

    @staticmethod
    def retain_real_patient_terms(
        phenotypic_features: List[PhenotypicFeature],
        number_of_scrambled_terms: int,
    ) -> List[PhenotypicFeature]:
        """
        Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms.

        Args:
            phenotypic_features (List[PhenotypicFeature]): List of phenotypic features.
            number_of_scrambled_terms (int): The count of scrambled HPO terms.

        Returns:
            List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. An empty
            input yields an empty list; a single-feature input always keeps that feature.
        """
        total = len(phenotypic_features)
        if total == 1:
            number_of_real_id = 1
        else:
            # Clamp into [0, total] so an empty profile or an over-large scramble count
            # produces a valid sample size instead of a ValueError from random.sample.
            number_of_real_id = min(max(total - number_of_scrambled_terms, 0), total)
        return random.sample(phenotypic_features, number_of_real_id)

    def convert_patient_terms_to_parent(
        self,
        phenotypic_features: List[PhenotypicFeature],
        retained_phenotypic_features: List[PhenotypicFeature],
        number_of_scrambled_terms: int,
    ) -> List[PhenotypicFeature]:
        """
        Convert a subset of patient HPO terms to their respective parent terms.

        Args:
            phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features.
            retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features.
            number_of_scrambled_terms (int): The count of scrambled HPO terms.

        Returns:
            List[PhenotypicFeature]: A list of HPO terms converted to their parent terms.

        Note:
            This method identifies the patient HPO terms that are not among the retained
            features and converts up to ``number_of_scrambled_terms`` of them to a randomly
            chosen parent term. A term with no parents is kept unchanged.
            If no remaining HPO terms are available for conversion, no parent terms are returned.
        """
        remaining_hpo = [i for i in phenotypic_features if i not in retained_phenotypic_features]
        # Duplicate features compare equal to a retained one and are filtered out above, so
        # fewer terms than requested may remain; never ask random.sample for more than exist.
        sample_size = min(max(number_of_scrambled_terms, 0), len(remaining_hpo))
        hpo_terms_to_be_changed = random.sample(remaining_hpo, sample_size)
        parent_terms = []
        for term in hpo_terms_to_be_changed:
            if self.hpo_ontology.label(term.type.id).startswith("obsolete"):
                # For an obsolete term, look parents up via the first value of the first
                # metadata entry (presumably the replacement term ID - TODO confirm).
                obsolete_term = self.hpo_ontology.entity_metadata_map(term.type.id)
                updated_term = list(obsolete_term.values())[0][0]
                parents = self.hpo_ontology.hierarchical_parents(updated_term)
            else:
                parents = self.hpo_ontology.hierarchical_parents(term.type.id)
            if not parents:
                parent_terms.append(term)
            else:
                parent_terms.append(self.retrieve_hpo_term(random.choice(parents)))
        return parent_terms

    def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> List[PhenotypicFeature]:
        """
        Generate a list of random HPO terms.

        Args:
            number_of_scrambled_terms (int): The count of random HPO terms to be generated.

        Returns:
            List[PhenotypicFeature]: A list of randomly selected HPO terms.
        """
        # Sorting gives random.sample an ordered sequence to draw from.
        random_ids = random.sample(sorted(self.phenotypic_abnormalities), number_of_scrambled_terms)
        return [self.retrieve_hpo_term(random_id) for random_id in random_ids]

    def randomise_hpo_terms(
        self,
        phenotypic_features: List[PhenotypicFeature],
    ) -> List[PhenotypicFeature]:
        """
        Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms.

        Args:
            phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised.

        Returns:
            List[PhenotypicFeature]: A list of randomised HPO terms.

        Note:
            This method randomises the provided phenotypic features by incorporating three types of HPO terms:
            1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor.
            2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms.
            3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor.

            The method determines the count of terms for each category and combines them to form a final list
            of randomised HPO terms to be used in the phenotypic features.
        """
        number_of_scrambled_terms = self.scramble_factor_proportions(phenotypic_features)
        retained_patient_terms = self.retain_real_patient_terms(
            phenotypic_features, number_of_scrambled_terms
        )
        return (
            retained_patient_terms
            + self.convert_patient_terms_to_parent(
                phenotypic_features, retained_patient_terms, number_of_scrambled_terms
            )
            + self.create_random_hpo_terms(number_of_scrambled_terms)
        )

__init__(hpo_ontology, scramble_factor)

Initialise the HpoRandomiser.

Parameters:

Name Type Description Default
hpo_ontology ProntoImplementation

The instance of the HPO ontology.

required
scramble_factor float

A factor for scrambling phenotypic features.

required
Source code in src/pheval/prepare/create_noisy_phenopackets.py
32
33
34
35
36
37
38
39
40
41
42
def __init__(self, hpo_ontology: ProntoImplementation, scramble_factor: float):
    """
    Set up the randomiser with an ontology and a scramble factor.

    Args:
        hpo_ontology (ProntoImplementation): Loaded HPO ontology used for term lookups.
        scramble_factor (float): Factor controlling how many phenotypic features are scrambled.
    """
    self.scramble_factor = scramble_factor
    self.hpo_ontology = hpo_ontology
    # Store the IDs returned by the ontology as a set so each one is held only once.
    abnormality_ids = hpo_ontology.roots(predicates=["HP:0000118"])
    self.phenotypic_abnormalities = set(abnormality_ids)

convert_patient_terms_to_parent(phenotypic_features, retained_phenotypic_features, number_of_scrambled_terms)

Convert a subset of patient HPO terms to their respective parent terms.

Parameters:

Name Type Description Default
phenotypic_features List[PhenotypicFeature]

List of all phenotypic features.

required
retained_phenotypic_features List[PhenotypicFeature]

List of retained non-scrambled phenotypic features.

required
number_of_scrambled_terms int

The count of scrambled HPO terms.

required

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: A list of HPO terms converted to their parent terms.

Note

This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def convert_patient_terms_to_parent(
    self,
    phenotypic_features: List[PhenotypicFeature],
    retained_phenotypic_features: List[PhenotypicFeature],
    number_of_scrambled_terms: int,
) -> List[PhenotypicFeature]:
    """
    Convert a subset of patient HPO terms to their respective parent terms.

    Args:
        phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features.
        retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features.
        number_of_scrambled_terms (int): The count of scrambled HPO terms.

    Returns:
        List[PhenotypicFeature]: A list of HPO terms converted to their parent terms.

    Note:
        This method identifies the patient HPO terms that are not among the retained
        features and converts up to ``number_of_scrambled_terms`` of them to a randomly
        chosen parent term. A term with no parents is kept unchanged.
        If no remaining HPO terms are available for conversion, no parent terms are returned.
    """
    remaining_hpo = [i for i in phenotypic_features if i not in retained_phenotypic_features]
    # Duplicate features compare equal to a retained one and are filtered out above, so
    # fewer terms than requested may remain; never ask random.sample for more than exist.
    sample_size = min(max(number_of_scrambled_terms, 0), len(remaining_hpo))
    hpo_terms_to_be_changed = random.sample(remaining_hpo, sample_size)
    parent_terms = []
    for term in hpo_terms_to_be_changed:
        if self.hpo_ontology.label(term.type.id).startswith("obsolete"):
            # For an obsolete term, look parents up via the first value of the first
            # metadata entry (presumably the replacement term ID - TODO confirm).
            obsolete_term = self.hpo_ontology.entity_metadata_map(term.type.id)
            updated_term = list(obsolete_term.values())[0][0]
            parents = self.hpo_ontology.hierarchical_parents(updated_term)
        else:
            parents = self.hpo_ontology.hierarchical_parents(term.type.id)
        if not parents:
            parent_terms.append(term)
        else:
            parent_terms.append(self.retrieve_hpo_term(random.choice(parents)))
    return parent_terms

create_random_hpo_terms(number_of_scrambled_terms)

Generate a list of random HPO terms.

Parameters:

Name Type Description Default
number_of_scrambled_terms int

The count of random HPO terms to be generated.

required

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: A list of randomly selected HPO terms.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> List[PhenotypicFeature]:
    """
    Draw random HPO terms from the stored phenotypic abnormality IDs.

    Args:
        number_of_scrambled_terms (int): How many random HPO terms to produce.

    Returns:
        List[PhenotypicFeature]: The randomly chosen terms, one per sampled ID.
    """
    # Sorting gives random.sample an ordered sequence to draw from.
    candidate_ids = sorted(self.phenotypic_abnormalities)
    random_features = []
    for chosen_id in random.sample(candidate_ids, number_of_scrambled_terms):
        random_features.append(self.retrieve_hpo_term(chosen_id))
    return random_features

randomise_hpo_terms(phenotypic_features)

Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms.

Parameters:

Name Type Description Default
phenotypic_features List[PhenotypicFeature]

List of phenotypic features to be randomised.

required

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: A list of randomised HPO terms.

Note

This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor.

The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def randomise_hpo_terms(
    self,
    phenotypic_features: List[PhenotypicFeature],
) -> List[PhenotypicFeature]:
    """
    Build a noisy phenotypic profile from retained, parent-converted, and random HPO terms.

    Args:
        phenotypic_features (List[PhenotypicFeature]): The phenotypic features to randomise.

    Returns:
        List[PhenotypicFeature]: Retained terms, followed by parent-converted terms,
        followed by random terms.

    Note:
        The scramble count is computed once from the scramble factor and then drives
        all three parts:
        1. Retained Patient Terms: real patient terms kept unchanged.
        2. Converted to Parent Terms: non-retained terms replaced by a parent term.
        3. Random HPO Terms: newly drawn random terms.
    """
    scramble_count = self.scramble_factor_proportions(phenotypic_features)
    kept_terms = self.retain_real_patient_terms(phenotypic_features, scramble_count)
    parent_terms = self.convert_patient_terms_to_parent(
        phenotypic_features, kept_terms, scramble_count
    )
    random_terms = self.create_random_hpo_terms(scramble_count)
    return kept_terms + parent_terms + random_terms

retain_real_patient_terms(phenotypic_features, number_of_scrambled_terms) staticmethod

Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms.

Parameters:

Name Type Description Default
phenotypic_features List[PhenotypicFeature]

List of phenotypic features.

required
number_of_scrambled_terms int

The count of scrambled HPO terms.

required

Returns:

Type Description
List[PhenotypicFeature]

List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
@staticmethod
def retain_real_patient_terms(
    phenotypic_features: List[PhenotypicFeature],
    number_of_scrambled_terms: int,
) -> List[PhenotypicFeature]:
    """
    Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms.

    Args:
        phenotypic_features (List[PhenotypicFeature]): List of phenotypic features.
        number_of_scrambled_terms (int): The count of scrambled HPO terms.

    Returns:
        List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms.
    """
    if len(phenotypic_features) > 1:
        number_of_real_id = len(phenotypic_features) - number_of_scrambled_terms
    else:
        number_of_real_id = 1
    return random.sample(phenotypic_features, number_of_real_id)

retrieve_hpo_term(hpo_id)

Retrieve an HPO term based on the provided HPO ID.

Parameters:

Name Type Description Default
hpo_id str

The HPO ID of the term to retrieve.

required

Returns:

Name Type Description
PhenotypicFeature PhenotypicFeature

The PhenotypicFeature object representing the retrieved HPO term.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
59
60
61
62
63
64
65
66
67
68
69
70
71
def retrieve_hpo_term(self, hpo_id: str) -> PhenotypicFeature:
    """
    Build a PhenotypicFeature for the given HPO ID.

    Args:
        hpo_id (str): The HPO ID to look up in the ontology.

    Returns:
        PhenotypicFeature: A feature whose type carries the HPO ID and the label
        assembled from the ontology's alias map.
    """
    alias_map = self.hpo_ontology.entity_alias_map(hpo_id)
    # The label is built from the values stored under the first key of the alias map.
    first_key = list(alias_map.keys())[0]
    label = "".join(alias_map[first_key])
    ontology_class = OntologyClass(id=hpo_id, label=label)
    return PhenotypicFeature(type=ontology_class)

scramble_factor_proportions(phenotypic_features)

Calculate the proportion of scrambled HPO terms based on the scramble factor.

Parameters:

Name Type Description Default
phenotypic_features list[PhenotypicFeature]

List of phenotypic features.

required

Returns:

Name Type Description
int int

The calculated number of phenotypic features to be scrambled.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def scramble_factor_proportions(self, phenotypic_features: list[PhenotypicFeature]) -> int:
    """
    Work out how many phenotypic features should be scrambled.

    Args:
        phenotypic_features (list[PhenotypicFeature]): List of phenotypic features.

    Returns:
        int: 1 when exactly one feature is given; otherwise the feature count
        multiplied by the scramble factor, rounded to a whole number.
    """
    feature_count = len(phenotypic_features)
    # A single-feature profile always counts as one scrambled term.
    if feature_count == 1:
        return 1
    scaled_count = feature_count * self.scramble_factor
    return int(round(scaled_count, 0))

add_noise_to_phenotypic_profile(hpo_randomiser, phenopacket)

Randomise the phenotypic profile of a Phenopacket or Family.

Parameters:

Name Type Description Default
hpo_randomiser HpoRandomiser

An instance of HpoRandomiser used for randomisation.

required
phenopacket Union[Phenopacket, Family]

The Phenopacket or Family to be randomised.

required

Returns:

Type Description
Union[Phenopacket, Family]

Union[Phenopacket, Family]: The randomised Phenopacket or Family.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def add_noise_to_phenotypic_profile(
    hpo_randomiser: HpoRandomiser,
    phenopacket: Union[Phenopacket, Family],
) -> Union[Phenopacket, Family]:
    """
    Replace the observed phenotypic features of a Phenopacket or Family with a randomised set.

    Args:
        hpo_randomiser (HpoRandomiser): Randomiser used to generate the noisy terms.
        phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to randomise.

    Returns:
        Union[Phenopacket, Family]: The result of rebuilding the input with the randomised terms.
    """
    # Only the observed features are fed to the randomiser.
    observed_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
    noisy_features = hpo_randomiser.randomise_hpo_terms(observed_features)
    return PhenopacketRebuilder(phenopacket).add_randomised_hpo(noisy_features)

create_scrambled_phenopacket(output_dir, phenopacket_path, scramble_factor)

Create a scrambled version of a Phenopacket.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the output scrambled Phenopacket.

required
phenopacket_path Path

The path to the original Phenopacket file.

required
scramble_factor float

A factor determining the level of scrambling for phenotypic features.

required
Source code in src/pheval/prepare/create_noisy_phenopackets.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def create_scrambled_phenopacket(
    output_dir: Path, phenopacket_path: Path, scramble_factor: float
) -> None:
    """
    Scramble a single Phenopacket and write the result to the output directory.

    Args:
        output_dir (Path): The directory to store the output scrambled Phenopacket.
        phenopacket_path (Path): The path to the original Phenopacket file.
        scramble_factor (float): A factor determining the level of scrambling for phenotypic features.
    """
    hpo_randomiser = HpoRandomiser(load_ontology(), scramble_factor)
    original = phenopacket_reader(phenopacket_path)
    noisy = add_noise_to_phenotypic_profile(hpo_randomiser, original)
    # The output keeps the input file's name, placed under output_dir.
    destination = output_dir.joinpath(phenopacket_path.name)
    write_phenopacket(noisy, destination)

create_scrambled_phenopackets(output_dir, phenopacket_dir, scramble_factor)

Create scrambled versions of Phenopackets within a directory.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the output scrambled Phenopackets.

required
phenopacket_dir Path

The directory containing the original Phenopacket files.

required
scramble_factor float

A factor determining the level of scrambling for phenotypic features.

required
Source code in src/pheval/prepare/create_noisy_phenopackets.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def create_scrambled_phenopackets(
    output_dir: Path, phenopacket_dir: Path, scramble_factor: float
) -> None:
    """
    Scramble every ``.json`` Phenopacket in a directory and write the results.

    Args:
        output_dir (Path): The directory to store the output scrambled Phenopackets.
        phenopacket_dir (Path): The directory containing the original Phenopacket files.
        scramble_factor (float): A factor determining the level of scrambling for phenotypic features.
    """
    # The ontology and randomiser are created once and reused for every file.
    hpo_randomiser = HpoRandomiser(load_ontology(), scramble_factor)
    for source_path in files_with_suffix(phenopacket_dir, ".json"):
        original = phenopacket_reader(source_path)
        noisy = add_noise_to_phenotypic_profile(hpo_randomiser, original)
        # Each output keeps its input file's name, placed under output_dir.
        write_phenopacket(noisy, output_dir.joinpath(source_path.name))

load_ontology()

Load the Human Phenotype Ontology (HPO).

Returns:

Name Type Description
ProntoImplementation

An instance of ProntoImplementation containing the loaded HPO.

Source code in src/pheval/prepare/create_noisy_phenopackets.py
18
19
20
21
22
23
24
25
26
def load_ontology():
    """
    Load the Human Phenotype Ontology (HPO).

    Returns:
        ProntoImplementation: A ProntoImplementation built from the ``hp.obo`` resource.
    """
    # local=False is passed through to OntologyResource unchanged.
    return ProntoImplementation(OntologyResource(slug="hp.obo", local=False))

scramble_phenopackets(output_dir, phenopacket_path, phenopacket_dir, scramble_factor)

Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the output scrambled Phenopackets.

required
phenopacket_path Path

The path to a single Phenopacket file (if applicable).

required
phenopacket_dir Path

The directory containing multiple Phenopacket files (if applicable).

required
scramble_factor float

A factor determining the level of scrambling for phenotypic features.

required
Source code in src/pheval/prepare/create_noisy_phenopackets.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def scramble_phenopackets(
    output_dir: Path, phenopacket_path: Path, phenopacket_dir: Path, scramble_factor: float
) -> None:
    """
    Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets.

    Args:
        output_dir (Path): The directory to store the output scrambled Phenopackets. It is
            created, together with any missing parent directories, if it does not exist.
        phenopacket_path (Path): The path to a single Phenopacket file (if applicable).
        phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable).
        scramble_factor (float): A factor determining the level of scrambling for phenotypic features.

    Note:
        ``phenopacket_path`` takes precedence when both inputs are given. When both are
        ``None`` only the output directory is created.
    """
    # parents=True lets a nested output path be used without pre-creating its ancestors.
    output_dir.mkdir(parents=True, exist_ok=True)
    if phenopacket_path is not None:
        create_scrambled_phenopacket(output_dir, phenopacket_path, scramble_factor)
    elif phenopacket_dir is not None:
        create_scrambled_phenopackets(output_dir, phenopacket_dir, scramble_factor)