Create spiked vcf

`VcfFile` `dataclass`

Represents a VCF file with its name, contents, and header information.

Attributes:

Name	Type	Description
`vcf_file_name`	`str`	The name of the VCF file.
`vcf_contents`	`List[str]`	The contents of the VCF file.
`vcf_header`	`VcfHeader`	The parsed header information of the VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py

@dataclass
class VcfFile:
    """
    Container for a VCF file: its file name, its lines, and its parsed header.

    Attributes:
        vcf_file_name (str): The name of the VCF file.
        vcf_contents (List[str]): The contents of the VCF file.
        vcf_header (VcfHeader): The parsed header information of the VCF file.
    """

    vcf_file_name: str = None
    vcf_contents: List[str] = None
    vcf_header: VcfHeader = None

    @staticmethod
    def populate_fields(template_vcf: Path):
        """
        Build a VcfFile from a template VCF file.

        The file is read with ``read_vcf`` and its header is parsed with
        ``VcfHeaderParser``.

        Args:
            template_vcf (Path): The path to the template VCF file.

        Returns:
            VcfFile: An instance holding the template's name, contents and parsed header.

        """
        vcf_lines = read_vcf(template_vcf)
        file_name = template_vcf.name
        parsed_header = VcfHeaderParser(vcf_lines).parse_vcf_header()
        return VcfFile(
            vcf_file_name=file_name,
            vcf_contents=vcf_lines,
            vcf_header=parsed_header,
        )

`populate_fields(template_vcf)` `staticmethod`

Populate the fields of the VcfFile instance using the contents of a template VCF file.

Parameters:

Name	Type	Description	Default
`template_vcf`	`Path`	The path to the template VCF file.	required

Returns:

Name	Type	Description
`VcfFile`		An instance of VcfFile with populated fields.

Source code in src/pheval/prepare/create_spiked_vcf.py

@staticmethod
def populate_fields(template_vcf: Path):
    """
    Build a VcfFile from a template VCF file.

    The file is read with ``read_vcf`` and its header is parsed with
    ``VcfHeaderParser``.

    Args:
        template_vcf (Path): The path to the template VCF file.

    Returns:
        VcfFile: An instance holding the template's name, contents and parsed header.

    """
    vcf_lines = read_vcf(template_vcf)
    file_name = template_vcf.name
    parsed_header = VcfHeaderParser(vcf_lines).parse_vcf_header()
    return VcfFile(
        vcf_file_name=file_name,
        vcf_contents=vcf_lines,
        vcf_header=parsed_header,
    )

`VcfHeader` `dataclass`

Data obtained from VCF header.

Parameters:

Name	Type	Description	Default
`sample_id`	`str`	The sample identifier from the VCF header.	required
`assembly`	`str`	The assembly information obtained from the VCF header.	required
`chr_status`	`bool`	A boolean indicating whether the VCF denotes chromosomes as chr or not.	required

Source code in src/pheval/prepare/create_spiked_vcf.py

@dataclass
class VcfHeader:
    """Data obtained from VCF header.

    All three fields are required; the dataclass declares no defaults.

    Attributes:
        sample_id (str): The sample identifier from the VCF header.
        assembly (str): The assembly information obtained from the VCF header.
        chr_status (bool): A boolean indicating whether the VCF denotes chromosomes as chr or not.
    """

    sample_id: str
    assembly: str
    chr_status: bool

`VcfHeaderParser`

Class for parsing the header of a VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py

class VcfHeaderParser:
    """Class for parsing the header of a VCF file."""

    def __init__(self, vcf_contents: list[str]):
        """
        Initialise the VcfHeaderParser.

        Args:
            vcf_contents (list[str]): The contents of the VCF file as a list of strings.
        """
        self.vcf_contents = vcf_contents

    def parse_assembly(self) -> tuple[str, bool]:
        """
        Parse the genome assembly and chromosome naming format from the contig header lines.

        Every ``##contig=<ID...`` line contributes its contig ID and length. Only contigs
        whose ID (with any "chr" removed) is purely numeric are kept, and the resulting
        ID -> length mapping is compared against the entries of ``genome_assemblies``.

        Returns:
            Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False).
                The status is True when at least one contig ID contains "chr".

        Raises:
            IndexError: If the numeric contig lengths match no entry of ``genome_assemblies``,
                or if a kept contig line has no ``length=`` field.
        """
        vcf_assembly = {}
        chr_status = False
        for line in self.vcf_contents:
            if not line.startswith("##contig=<ID"):
                continue
            tokens = line.split(",")
            chromosome = re.sub(r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0])
            if "chr" in chromosome:
                chr_status = True
                chromosome = chromosome.replace("chr", "")
            # Only numeric contigs take part in the assembly comparison. Filtering here,
            # rather than rebuilding the whole dict after every line, keeps the loop linear.
            if not chromosome.isdigit():
                continue
            contig_length = re.sub(
                "[^0-9]+",
                "",
                [token for token in tokens if "length=" in token][0],
            )
            vcf_assembly[chromosome] = int(contig_length)
        matching_assemblies = [
            name for name, contigs in genome_assemblies.items() if contigs == vcf_assembly
        ]
        if not matching_assemblies:
            # IndexError is kept as the exception type for backward compatibility; the
            # message replaces the bare "list index out of range".
            raise IndexError(
                "Could not determine the genome assembly: the contig lengths in the VCF "
                "header do not match any known assembly."
            )
        return matching_assemblies[0], chr_status

    def parse_sample_id(self) -> str:
        """
        Parse the sample ID of the VCF.

        The sample ID is taken from the tenth tab-separated column of the first line that
        starts with ``#CHROM``.

        Returns:
            str: The sample ID extracted from the VCF header, or None when the contents
                contain no ``#CHROM`` line.

        Raises:
            IndexError: If the ``#CHROM`` line has fewer than ten tab-separated columns.
        """
        for line in self.vcf_contents:
            if line.startswith("#CHROM"):
                return line.split("\t")[9].rstrip()

    def parse_vcf_header(self) -> VcfHeader:
        """
        Parse the header of the VCF.

        Returns:
            VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status.
        """
        assembly, chr_status = self.parse_assembly()
        sample_id = self.parse_sample_id()
        return VcfHeader(sample_id, assembly, chr_status)

`__init__(vcf_contents)`

Initialise the VcfHeaderParser.

Parameters:

Name	Type	Description	Default
`vcf_contents`	`list[str]`	The contents of the VCF file as a list of strings.	required

Source code in src/pheval/prepare/create_spiked_vcf.py

def __init__(self, vcf_contents: list[str]):
    """
    Initialise the VcfHeaderParser.

    Args:
        vcf_contents (list[str]): The contents of the VCF file as a list of strings.
    """
    # The list is stored by reference; no copy is made.
    self.vcf_contents = vcf_contents

`parse_assembly()`

Parse the genome assembly and the chromosome naming format (with or without "chr") from the VCF contig header lines.

Returns:

Type	Description
`tuple[str, bool]`	A tuple containing the assembly and chromosome status (True/False).

Source code in src/pheval/prepare/create_spiked_vcf.py

def parse_assembly(self) -> tuple[str, bool]:
    """
    Parse the genome assembly and chromosome naming format from the contig header lines.

    Every ``##contig=<ID...`` line contributes its contig ID and length. Only contigs
    whose ID (with any "chr" removed) is purely numeric are kept, and the resulting
    ID -> length mapping is compared against the entries of ``genome_assemblies``.

    Returns:
        Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False).
            The status is True when at least one contig ID contains "chr".

    Raises:
        IndexError: If the numeric contig lengths match no entry of ``genome_assemblies``,
            or if a kept contig line has no ``length=`` field.
    """
    vcf_assembly = {}
    chr_status = False
    for line in self.vcf_contents:
        if not line.startswith("##contig=<ID"):
            continue
        tokens = line.split(",")
        chromosome = re.sub(r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0])
        if "chr" in chromosome:
            chr_status = True
            chromosome = chromosome.replace("chr", "")
        # Only numeric contigs take part in the assembly comparison. Filtering here,
        # rather than rebuilding the whole dict after every line, keeps the loop linear.
        if not chromosome.isdigit():
            continue
        contig_length = re.sub(
            "[^0-9]+",
            "",
            [token for token in tokens if "length=" in token][0],
        )
        vcf_assembly[chromosome] = int(contig_length)
    matching_assemblies = [
        name for name, contigs in genome_assemblies.items() if contigs == vcf_assembly
    ]
    if not matching_assemblies:
        # IndexError is kept as the exception type for backward compatibility; the
        # message replaces the bare "list index out of range".
        raise IndexError(
            "Could not determine the genome assembly: the contig lengths in the VCF "
            "header do not match any known assembly."
        )
    return matching_assemblies[0], chr_status

`parse_sample_id()`

Parse the sample ID of the VCF.

Returns:

Name	Type	Description
`str`	`str`	The sample ID extracted from the VCF header.

Source code in src/pheval/prepare/create_spiked_vcf.py

def parse_sample_id(self) -> str:
    """
    Extract the sample ID from the VCF column header line.

    The ID is the tenth tab-separated column of the first line starting with
    ``#CHROM``, with trailing whitespace removed.

    Returns:
        str: The sample ID extracted from the VCF header, or None when no
            ``#CHROM`` line is present.
    """
    column_header_lines = (row for row in self.vcf_contents if row.startswith("#CHROM"))
    column_header = next(column_header_lines, None)
    if column_header is None:
        return None
    columns = column_header.split("\t")
    return columns[9].rstrip()

`parse_vcf_header()`

Parse the header of the VCF.

Returns:

Name	Type	Description
`VcfHeader`	`VcfHeader`	An instance of VcfHeader containing sample ID, assembly, and chromosome status.

Source code in src/pheval/prepare/create_spiked_vcf.py

def parse_vcf_header(self) -> VcfHeader:
    """
    Assemble a VcfHeader from the parsed assembly, chromosome status and sample ID.

    Returns:
        VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status.
    """
    # The assembly is parsed before the sample ID, as in the original call order.
    genome_build, uses_chr_prefix = self.parse_assembly()
    sample = self.parse_sample_id()
    return VcfHeader(sample_id=sample, assembly=genome_build, chr_status=uses_chr_prefix)

`VcfSpiker`

Class for spiking proband variants into template VCF file contents.

Source code in src/pheval/prepare/create_spiked_vcf.py

class VcfSpiker:
    """Class for spiking proband variants into template VCF file contents."""

    def __init__(
        self,
        vcf_contents: list[str],
        proband_causative_variants: list[ProbandCausativeVariant],
        vcf_header: VcfHeader,
    ):
        """
        Initialise the VcfSpiker.

        Args:
            vcf_contents (List[str]): Contents of the template VCF file.
            proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants.
            vcf_header (VcfHeader): The VCF header information.
        """
        self.vcf_contents = vcf_contents
        self.proband_causative_variants = proband_causative_variants
        self.vcf_header = vcf_header

    def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> List[str]:
        """
        Construct the VCF record fields for a single proband variant.

        When the template VCF uses "chr"-style chromosome names and the variant's chromosome
        does not contain "chr", the prefix is added to the returned entry only; the supplied
        variant object is left unmodified so it can be reused.

        Args:
            proband_variant_data (ProbandCausativeVariant): Data for the proband variant.

        Returns:
            List[str]: Constructed variant entry as a list of strings; the last element
                (the genotype) ends with a newline.

        Raises:
            KeyError: If the lower-cased genotype is not one of the recognised zygosities.
        """
        genotype_codes = {
            "hemizygous": "0/1",
            "homozygous": "1/1",
            "heterozygous": "0/1",
            "compound heterozygous": "0/1",
        }
        variant = proband_variant_data.variant
        # Work on a local copy of the chromosome name instead of writing back to the input.
        chrom = variant.chrom
        if self.vcf_header.chr_status is True and "chr" not in chrom:
            chrom = "chr" + chrom
        return [
            chrom,
            str(variant.pos),
            ".",
            variant.ref,
            # A reference allele of "N" gets its ALT written in angle brackets.
            f"<{variant.alt}>" if variant.ref == "N" else variant.alt,
            "100",
            "PASS",
            proband_variant_data.info if proband_variant_data.info else ".",
            "GT",
            genotype_codes[proband_variant_data.genotype.lower()] + "\n",
        ]

    def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
        """
        Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.

        Each variant is placed directly after the last record on the same chromosome with a
        smaller position. If every record on that chromosome has an equal or larger position,
        the variant is placed before the first record of that chromosome. If the chromosome
        does not occur in the contents, a warning is logged and the variant is appended.

        Args:
            template_vcf_name (str): Name of the template VCF file.

        Returns:
            List[str]: Updated VCF records containing the spiked variants. The template
                contents held by the instance are not modified.
        """
        updated_vcf_records = copy(self.vcf_contents)
        for variant in self.proband_causative_variants:
            variant_entry = self.construct_variant_entry(variant)
            entry_chrom = variant_entry[0]
            entry_pos = int(variant_entry[1])
            first_same_chrom_index = None
            last_smaller_index = None
            for i, record in enumerate(updated_vcf_records):
                fields = record.split("\t")
                # Header lines never equal a chromosome name in their first field.
                if fields[0] != entry_chrom:
                    continue
                if first_same_chrom_index is None:
                    first_same_chrom_index = i
                if int(fields[1]) < entry_pos:
                    last_smaller_index = i
            if last_smaller_index is not None:
                variant_entry_position = last_smaller_index + 1
            elif first_same_chrom_index is not None:
                # The variant precedes every record on its chromosome: keep it with that
                # chromosome instead of appending it to the end of the file.
                variant_entry_position = first_same_chrom_index
            else:
                info_log.warning(
                    f"Could not find entry position for {entry_chrom}-{variant.variant.pos}-"
                    f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
                    "inserting at end of VCF contents."
                )
                variant_entry_position = len(updated_vcf_records)
            updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
        return updated_vcf_records

    def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
        """
        Construct the header of the VCF.

        In every line starting with "#", occurrences of the template sample ID are replaced
        with the proband ID of the first causative variant. Other lines are kept unchanged.

        Args:
            updated_vcf_records (List[str]): Updated VCF records.

        Returns:
            List[str]: All VCF lines, with the sample ID substituted in the header lines.
        """
        updated_vcf_file = []
        for line in updated_vcf_records:
            if line.startswith("#"):
                text = line.replace(
                    self.vcf_header.sample_id,
                    self.proband_causative_variants[0].proband_id,
                )
            else:
                text = line
            updated_vcf_file.append(text)
        return updated_vcf_file

    def construct_vcf(self, template_vcf_name: str) -> List[str]:
        """
        Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.

        Args:
            template_vcf_name (str): Name of the template VCF file.

        Returns:
            List[str]: The complete spiked VCF file content as a list of strings.
        """
        return self.construct_header(self.construct_vcf_records(template_vcf_name))

`__init__(vcf_contents, proband_causative_variants, vcf_header)`

Initialise the VcfSpiker.

Parameters:

Name	Type	Description	Default
`vcf_contents`	`List[str]`	Contents of the template VCF file.	required
`proband_causative_variants`	`List[ProbandCausativeVariant]`	List of proband causative variants.	required
`vcf_header`	`VcfHeader`	The VCF header information.	required

Source code in src/pheval/prepare/create_spiked_vcf.py

def __init__(
    self,
    vcf_contents: list[str],
    proband_causative_variants: list[ProbandCausativeVariant],
    vcf_header: VcfHeader,
):
    """
    Initialise the VcfSpiker.

    Args:
        vcf_contents (list[str]): Contents of the template VCF file.
        proband_causative_variants (list[ProbandCausativeVariant]): List of proband causative variants.
        vcf_header (VcfHeader): The VCF header information.
    """
    # All three arguments are stored by reference; nothing is copied here.
    self.vcf_contents = vcf_contents
    self.proband_causative_variants = proband_causative_variants
    self.vcf_header = vcf_header

`construct_header(updated_vcf_records)`

Construct the header of the VCF.

Parameters:

Name	Type	Description	Default
`updated_vcf_records`	`List[str]`	Updated VCF records.	required

Returns:

Type	Description
`List[str]`	Constructed header as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py

def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
    """
    Swap the template sample ID for the proband ID in the header lines.

    In every line starting with "#", occurrences of the template sample ID are
    replaced with the proband ID of the first causative variant. All other lines
    are passed through unchanged.

    Args:
        updated_vcf_records (List[str]): Updated VCF records.

    Returns:
        List[str]: The VCF lines with the sample ID substituted in the header lines.
    """
    return [
        (
            record.replace(
                self.vcf_header.sample_id,
                self.proband_causative_variants[0].proband_id,
            )
            if record.startswith("#")
            else record
        )
        for record in updated_vcf_records
    ]

`construct_variant_entry(proband_variant_data)`

Construct variant entries.

Parameters:

Name	Type	Description	Default
`proband_variant_data`	`ProbandCausativeVariant`	Data for the proband variant.	required

Returns:

Type	Description
`List[str]`	Constructed variant entry as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py

def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> List[str]:
    """
    Construct the VCF record fields for a single proband variant.

    When the template VCF uses "chr"-style chromosome names and the variant's chromosome
    does not contain "chr", the prefix is added to the returned entry only; the supplied
    variant object is left unmodified so it can be reused.

    Args:
        proband_variant_data (ProbandCausativeVariant): Data for the proband variant.

    Returns:
        List[str]: Constructed variant entry as a list of strings; the last element
            (the genotype) ends with a newline.

    Raises:
        KeyError: If the lower-cased genotype is not one of the recognised zygosities.
    """
    genotype_codes = {
        "hemizygous": "0/1",
        "homozygous": "1/1",
        "heterozygous": "0/1",
        "compound heterozygous": "0/1",
    }
    variant = proband_variant_data.variant
    # Work on a local copy of the chromosome name instead of writing back to the input.
    chrom = variant.chrom
    if self.vcf_header.chr_status is True and "chr" not in chrom:
        chrom = "chr" + chrom
    return [
        chrom,
        str(variant.pos),
        ".",
        variant.ref,
        # A reference allele of "N" gets its ALT written in angle brackets.
        f"<{variant.alt}>" if variant.ref == "N" else variant.alt,
        "100",
        "PASS",
        proband_variant_data.info if proband_variant_data.info else ".",
        "GT",
        genotype_codes[proband_variant_data.genotype.lower()] + "\n",
    ]

`construct_vcf(template_vcf_name)`

Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.

Parameters:

Name	Type	Description	Default
`template_vcf_name`	`str`	Name of the template VCF file.	required

Returns:

Type	Description
`List[str]`	The complete spiked VCF file content as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py

def construct_vcf(self, template_vcf_name: str) -> List[str]:
    """
    Build the full spiked VCF: insert the variants, then rewrite the header.

    Args:
        template_vcf_name (str): Name of the template VCF file.

    Returns:
        List[str]: The complete spiked VCF file content as a list of strings.
    """
    spiked_records = self.construct_vcf_records(template_vcf_name)
    spiked_vcf = self.construct_header(spiked_records)
    return spiked_vcf

`construct_vcf_records(template_vcf_name)`

Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.

Parameters:

Name	Type	Description	Default
`template_vcf_name`	`str`	Name of the template VCF file.	required

Returns:

Type	Description
`List[str]`	Updated VCF records containing the spiked variants.

Source code in src/pheval/prepare/create_spiked_vcf.py

def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
    """
    Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.

    Each variant is placed directly after the last record on the same chromosome with a
    smaller position. If every record on that chromosome has an equal or larger position,
    the variant is placed before the first record of that chromosome. If the chromosome
    does not occur in the contents, a warning is logged and the variant is appended.

    Args:
        template_vcf_name (str): Name of the template VCF file.

    Returns:
        List[str]: Updated VCF records containing the spiked variants. The template
            contents held by the instance are not modified.
    """
    updated_vcf_records = copy(self.vcf_contents)
    for variant in self.proband_causative_variants:
        variant_entry = self.construct_variant_entry(variant)
        entry_chrom = variant_entry[0]
        entry_pos = int(variant_entry[1])
        first_same_chrom_index = None
        last_smaller_index = None
        for i, record in enumerate(updated_vcf_records):
            fields = record.split("\t")
            # Header lines never equal a chromosome name in their first field.
            if fields[0] != entry_chrom:
                continue
            if first_same_chrom_index is None:
                first_same_chrom_index = i
            if int(fields[1]) < entry_pos:
                last_smaller_index = i
        if last_smaller_index is not None:
            variant_entry_position = last_smaller_index + 1
        elif first_same_chrom_index is not None:
            # The variant precedes every record on its chromosome: keep it with that
            # chromosome instead of appending it to the end of the file.
            variant_entry_position = first_same_chrom_index
        else:
            info_log.warning(
                f"Could not find entry position for {entry_chrom}-{variant.variant.pos}-"
                f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
                "inserting at end of VCF contents."
            )
            variant_entry_position = len(updated_vcf_records)
        updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
    return updated_vcf_records

`VcfWriter`

Class for writing VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py

class VcfWriter:
    """Class for writing VCF file."""

    def __init__(
        self,
        vcf_contents: List[str],
        spiked_vcf_file_path: Path,
    ):
        """
        Initialise the VcfWriter class.

        Args:
            vcf_contents (List[str]): Contents of the VCF file to be written.
            spiked_vcf_file_path (Path): Path to the spiked VCF file to be created.
        """
        self.vcf_contents = vcf_contents
        self.spiked_vcf_file_path = spiked_vcf_file_path

    def write_gzip(self) -> None:
        """
        Write the VCF contents to a gzipped VCF file.

        Each line is encoded with ``str.encode()`` and written as-is; no line
        terminators are added.
        """
        # The context manager closes the file, so no explicit close() is needed.
        with gzip.open(self.spiked_vcf_file_path, "wb") as f:
            for line in self.vcf_contents:
                f.write(line.encode())

    def write_uncompressed(self) -> None:
        """
        Write the VCF contents to an uncompressed VCF file.

        The lines are written as-is; no line terminators are added.
        """
        # The context manager closes the file, so no explicit close() is needed.
        with open(self.spiked_vcf_file_path, "w") as file:
            file.writelines(self.vcf_contents)

    def write_vcf_file(self) -> None:
        """
        Write the VCF file based on compression type.

        Determines the file writing method based on the compression type of the spiked VCF file path.
        Writes the VCF contents to the corresponding file format (gzip or uncompressed).
        """
        if is_gzipped(self.spiked_vcf_file_path):
            self.write_gzip()
        else:
            self.write_uncompressed()

`__init__(vcf_contents, spiked_vcf_file_path)`

Initialise the VcfWriter class.

Parameters:

Name	Type	Description	Default
`vcf_contents`	`List[str]`	Contents of the VCF file to be written.	required
`spiked_vcf_file_path`	`Path`	Path to the spiked VCF file to be created.	required

Source code in src/pheval/prepare/create_spiked_vcf.py

def __init__(
    self,
    vcf_contents: List[str],
    spiked_vcf_file_path: Path,
):
    """
    Initialise the VcfWriter class.

    Args:
        vcf_contents (List[str]): Contents of the VCF file to be written.
        spiked_vcf_file_path (Path): Path to the spiked VCF file to be created.
    """
    # Nothing is written here; the arguments are only stored on the instance.
    self.vcf_contents = vcf_contents
    self.spiked_vcf_file_path = spiked_vcf_file_path

`write_gzip()`

Write the VCF contents to a gzipped VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py

def write_gzip(self) -> None:
    """
    Write the VCF contents to a gzipped VCF file.

    Each line is encoded with ``str.encode()`` and written as-is; no line
    terminators are added.
    """
    # The context manager closes the file, so no explicit close() is needed.
    with gzip.open(self.spiked_vcf_file_path, "wb") as f:
        for line in self.vcf_contents:
            f.write(line.encode())

`write_uncompressed()`

Write the VCF contents to an uncompressed VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py

def write_uncompressed(self) -> None:
    """
    Write the VCF contents to an uncompressed VCF file.

    The lines are written as-is; no line terminators are added.
    """
    # The context manager closes the file, so no explicit close() is needed.
    with open(self.spiked_vcf_file_path, "w") as file:
        file.writelines(self.vcf_contents)

`write_vcf_file()`

Write the VCF file based on compression type.

Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed).

Source code in src/pheval/prepare/create_spiked_vcf.py

def write_vcf_file(self) -> None:
    """
    Write the VCF contents using the writer that matches the output path.

    ``is_gzipped`` is consulted for the spiked VCF file path: when it returns a truthy
    value the contents are written gzipped, otherwise they are written uncompressed.
    """
    if is_gzipped(self.spiked_vcf_file_path):
        self.write_gzip()
        return
    self.write_uncompressed()

`check_variant_assembly(proband_causative_variants, vcf_header, phenopacket_path)`

Check that the genome assembly of the proband's causative variants is compatible with the assembly of the VCF.

Parameters:

Name	Type	Description	Default
`proband_causative_variants`	`List[ProbandCausativeVariant]`	A list of causative variants from the proband.	required
`vcf_header`	`VcfHeader`	An instance of VcfHeader representing the VCF file's header.	required
`phenopacket_path`	`Path`	The path to the Phenopacket file.	required

Raises:

Type	Description
`ValueError`	If the proband variants report more than one genome assembly.
`IncompatibleGenomeAssemblyError`	If the assembly in the Phenopacket is not a supported assembly or does not match the VCF assembly.

Source code in src/pheval/prepare/create_spiked_vcf.py

def check_variant_assembly(
    proband_causative_variants: list[ProbandCausativeVariant],
    vcf_header: VcfHeader,
    phenopacket_path: Path,
) -> None:
    """
    Check that the assembly of the proband variants is compatible with the VCF assembly.

    "hg19"/"GRCh37" are treated as one assembly group and "hg38"/"GRCh38" as another;
    the variants and the VCF must fall into the same group.

    Args:
        proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
        vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header.
        phenopacket_path (Path): The path to the Phenopacket file.

    Raises:
        ValueError: If the variants report more than one genome assembly.
        IncompatibleGenomeAssemblyError: If the assembly in the Phenopacket is not one of the
            supported names or does not match the VCF assembly.
    """
    grch37_names = {"hg19", "GRCh37"}
    grch38_names = {"hg38", "GRCh38"}
    phenopacket_assembly = list({variant.assembly for variant in proband_causative_variants})
    if len(phenopacket_assembly) > 1:
        raise ValueError("Too many genome assemblies!")
    declared_assembly = phenopacket_assembly[0]
    if declared_assembly not in grch37_names | grch38_names:
        raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
    # The declared assembly is known to be in exactly one of the two groups here.
    expected_names = grch37_names if declared_assembly in grch37_names else grch38_names
    if vcf_header.assembly not in expected_names:
        raise IncompatibleGenomeAssemblyError(
            assembly=phenopacket_assembly, phenopacket=phenopacket_path
        )