Bases: HpoBaseConceptRecognizer
Source code in pyphetools/creation/hpo_exact_cr.py
class HpoExactConceptRecognizer(HpoBaseConceptRecognizer):
    """Concept recognizer that reports HPO labels found verbatim, as whole words, in a text chunk.

    The label-to-id mapping is read from ``self._label_to_id``
    (presumably populated by the base class from the ``label_to_id``
    keyword argument -- confirm against ``HpoBaseConceptRecognizer``).
    """

    @staticmethod
    def from_hpo(hpo: hpotk.Ontology) -> "HpoExactConceptRecognizer":
        """Create a recognizer from an HPO ontology.

        :param hpo: ontology handed to ``get_label_to_id_map`` and ``get_id_to_label_map``
        :returns: a recognizer initialised with the two resulting maps
        """
        return HpoExactConceptRecognizer(
            label_to_id=get_label_to_id_map(hpo),
            id_to_primary_label=get_id_to_label_map(hpo),
        )

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base-class initialiser."""
        super().__init__(**kwargs)

    def _find_hpo_term_in_lc_chunk(self, lc_chunk) -> typing.List["ConceptMatch"]:
        """Find every known HPO label that occurs as a complete word/phrase in ``lc_chunk``.

        :param lc_chunk: text to search; the name indicates callers are expected to pass
            lower-cased text (the cheap substring pre-filter is case-sensitive)
        :returns: one ``ConceptMatch`` per matching label, in the iteration order of
            ``self._label_to_id``; ``start`` is the index of the first character of the
            whole-word match and ``end`` the index of its last character (inclusive)
        """
        hits = []
        for label, hpo_tid in self._label_to_id.items():
            key = label.lower()
            # Cheap substring pre-filter so a regex is only built for plausible labels.
            # An empty label could only yield a meaningless zero-length hit, so skip it.
            if not key or key not in lc_chunk:
                continue
            # Demand that the match is a complete word; otherwise we get spurious hits
            # such as Pica (HP:0011856) matching inside "typical".
            # - re.escape: labels may contain regex metacharacters such as "(" or "+".
            # - lookarounds instead of \b: identical for labels that start and end with
            #   a word character, but they also work when a label starts or ends with
            #   punctuation (where \b would demand an adjacent word character).
            pattern = r"(?<!\w)" + re.escape(key) + r"(?!\w)"
            match = re.search(pattern, lc_chunk, re.I)
            if match is None:
                continue
            # Get the term carrying the properly capitalized label.
            hp_term = super().get_term_from_id(hpo_id=hpo_tid)
            # Take the coordinates from the whole-word match itself: the first plain
            # substring occurrence may sit inside another word ("typical pica").
            hits.append(ConceptMatch(term=hp_term, start=match.start(), end=match.end() - 1))
        return hits
|