Skip to content

HpoExactConceptRecognizer

Bases: HpoBaseConceptRecognizer

Source code in pyphetools/creation/hpo_exact_cr.py
class HpoExactConceptRecognizer(HpoBaseConceptRecognizer):
    """Concept recognizer that finds HPO labels occurring verbatim, as whole words, in a text chunk.

    The label-to-id and id-to-primary-label mappings are handed to the base class
    constructor; this class only contributes the exact-match search strategy.
    """

    @staticmethod
    def from_hpo(hpo: hpotk.Ontology):
        """Create a recognizer from an HPO ontology object.

        :param hpo: the ontology from which the label/id maps are derived
            (via ``get_label_to_id_map`` and ``get_id_to_label_map``)
        :returns: a new ``HpoExactConceptRecognizer``
        """
        label_to_id = get_label_to_id_map(hpo)
        id_to_primary_label = get_id_to_label_map(hpo)

        return HpoExactConceptRecognizer(
            label_to_id=label_to_id,
            id_to_primary_label=id_to_primary_label,
        )

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base class constructor."""
        super().__init__(**kwargs)

    def _find_hpo_term_in_lc_chunk(self, lc_chunk) -> "typing.List[ConceptMatch]":
        """Return one match for every known label that occurs as a whole word in ``lc_chunk``.

        :param lc_chunk: the text to search; presumably already lower-cased by the
            caller (the cheap substring pre-filter below is case-sensitive) -- TODO confirm
        :returns: a list of ``ConceptMatch`` objects whose ``start`` and ``end``
            (inclusive) are the offsets of the first whole-word occurrence of the label
        """
        hits = []
        for hp_label, hpo_tid in self._label_to_id.items():
            key = hp_label.lower()
            # An empty label would "match" everywhere; it carries no information.
            if not key:
                continue
            # Cheap pre-filter: skip the regex entirely if the label is not even a substring.
            if key not in lc_chunk:
                continue
            # We demand that the match is a complete word, because otherwise we get
            # spurious matches such as Pica HP:0011856 matching to "typical".
            # The label is escaped so that characters such as '(' or '+' are taken
            # literally, and lookarounds are used rather than \b so that labels which
            # begin or end with a non-word character are still handled sensibly.
            boundary_regex = re.compile(r'(?<!\w)%s(?!\w)' % re.escape(key), re.I)
            match = boundary_regex.search(lc_chunk)
            if match is None:
                continue
            # Get properly capitalized label
            hp_term = super().get_term_from_id(hpo_id=hpo_tid)
            # Report the span of the whole-word match itself; the first raw substring
            # occurrence may lie inside another word (e.g. "pica" within "typical").
            hits.append(ConceptMatch(term=hp_term, start=match.start(), end=match.end() - 1))
        return hits