Skip to content

VariantColumnMapper

Column mapper for the variants identified in individuals

To use this mapper, first create a dictionary with all variants in the cohort using the VariantValidator (for HGVS) and the StructuralVariant classes. The key to the variant is the cell contents of the column with variant information in the original table. The Values are the Variant objects (implemented as HgvsVariant or StructuralVariant). This mapper will split cells that contain multiple variants if needed and also will add genotype information.

Parameters:

Name Type Description Default
variant_d Dict[str,Variant]

Dictionary with all variants found in the column

required
variant_column_name str

name of the variant column in the original table

required
genotype_column_name str

name of the genotype column in the original table, optional

None
delimiter str

symbol used to separate variants if more than one variant is provided, optional

None
Source code in pyphetools/creation/variant_column_mapper.py
class VariantColumnMapper:
    """Column mapper for the variants identified in individuals

    To use this mapper, first create a dictionary with all variants in the cohort using the
    VariantValidator (for HGVS) and the StructuralVariant classes. The key to the variant is
    the cell contents of the column with variant information in the original table. The Values
    are the Variant objects (implemented as HgvsVariant or StructuralVariant). This mapper will
    split cells that contain multiple variants if needed and also will add genotype information.

    :param variant_d: Dictionary with all variants found in the column
    :type variant_d: Dict[str,Variant]
    :param variant_column_name: name of the variant column in the original table
    :type variant_column_name: str
    :param genotype_column_name: name of the genotype column in the original table, optional
    :type genotype_column_name: str
    :param delimiter: symbol used to separate variants if more than one variant is provided, optional
    :type delimiter: str
    """
    def __init__(self,
                variant_d,
                variant_column_name,
                genotype_column_name=None,
                default_genotype=None,
                delimiter=None) -> None:

        if default_genotype is not None and default_genotype not in ACCEPTABLE_GENOTYPES:
            raise ValueError(f"Did not recognize default genotype {default_genotype}")
        if not isinstance(variant_d, dict):
            raise ValueError(f"Argument variant_d must be a dictionary but was {type(variant_d)}")
        self._variant_d = variant_d
        self._default_genotype = default_genotype
        self._variant_column_name = variant_column_name
        self._genotype_column_name = genotype_column_name
        self._delimiter = delimiter


    def _get_genotype(self, genotype_contents):
        """
        Get a genotype string

        get the genotype from the argument (which comes from the genotype column if available) or from the
        default genotype.
        :returns: one of "heterozygous", "homozygous", "hemizygous", None
        """
        if genotype_contents is not None and genotype_contents.lower() in ACCEPTABLE_GENOTYPES:
            return genotype_contents.lower()
        else:
            return self._default_genotype


    def map_cell(self, cell_contents, genotype_contents=None, delimiter=None) -> List[Variant]:
        """
        Map the contents of a variant cell (and optionally a genotype cell).

        If the delimiter is not None, search for the delimiter and split the cell into two variants
        :param cell_contents: contents of the original table cell representing the variant string
        :type cell_contents: str
        :param genotype_contents: contents of the original table cell representing the genotype (allelic status), optional
        :type genotype_contents: str
        :param delimiter: string or character that splits the cell contents into multiple entries, e.g., ";", optional
        :type delimiter: str
        """
        if delimiter is None:
            delimiter = self._delimiter
        if delimiter is not None:
            items = [x.strip() for x in cell_contents.split(delimiter)]
        else:
            items = [cell_contents]
        variant_interpretation_list = []
        for item in items:
            if item in self._variant_d:
                variant: Variant = self._variant_d.get(item)
                gt = self._get_genotype(genotype_contents)
                if gt is not None:
                    variant.set_genotype(gt)
                interpretation = variant.to_ga4gh_variant_interpretation()
                variant_interpretation_list.append(interpretation)
            else:
                raise ValueError(f"Did not recognize variant string \"{item}\"")

        return variant_interpretation_list



    def preview_column(self, column) -> pd.DataFrame:
        if not isinstance(column, pd.Series):
            raise ValueError("column argument must be pandas Series, but was {type(column)}")
        dlist = []
        for _, value in column.items():
            variant_interpretation_list = self.map_cell(str(value))
            if len(variant_interpretation_list) == 0:
                dlist.append({"variant": "n/a"})
            else:
                result_strings = []
                for v in variant_interpretation_list:
                    result_strings.append(v.__str__())
                dlist.append({"variant": ": ".join(result_strings)})
        return pd.DataFrame(dlist)

    def get_variant_column_name(self):
        return self._variant_column_name

    def get_genotype_colname(self):
        return self._genotype_column_name

map_cell(cell_contents, genotype_contents=None, delimiter=None)

Map the contents of a variant cell (and optionally a genotype cell).

If the delimiter is not None, search for the delimiter and split the cell into two variants

Parameters:

Name Type Description Default
cell_contents str

contents of the original table cell representing the variant string

required
genotype_contents str

contents of the original table cell representing the genotype (allelic status), optional

None
delimiter str

string or character that splits the cell contents into multiple entries, e.g., ";", optional

None
Source code in pyphetools/creation/variant_column_mapper.py
def map_cell(self, cell_contents, genotype_contents=None, delimiter=None) -> List[Variant]:
    """
    Map the contents of a variant cell (and optionally a genotype cell).

    If the delimiter is not None, search for the delimiter and split the cell into two variants
    :param cell_contents: contents of the original table cell representing the variant string
    :type cell_contents: str
    :param genotype_contents: contents of the original table cell representing the genotype (allelic status), optional
    :type genotype_contents: str
    :param delimiter: string or character that splits the cell contents into multiple entries, e.g., ";", optional
    :type delimiter: str
    """
    if delimiter is None:
        delimiter = self._delimiter
    if delimiter is not None:
        items = [x.strip() for x in cell_contents.split(delimiter)]
    else:
        items = [cell_contents]
    variant_interpretation_list = []
    for item in items:
        if item in self._variant_d:
            variant: Variant = self._variant_d.get(item)
            gt = self._get_genotype(genotype_contents)
            if gt is not None:
                variant.set_genotype(gt)
            interpretation = variant.to_ga4gh_variant_interpretation()
            variant_interpretation_list.append(interpretation)
        else:
            raise ValueError(f"Did not recognize variant string \"{item}\"")

    return variant_interpretation_list