Skip to content

Create spiked vcf

VcfFile dataclass

Represents a VCF file with its name, contents, and header information.

Attributes:

Name Type Description
vcf_file_name str

The name of the VCF file.

vcf_contents List[str]

The contents of the VCF file.

vcf_header VcfHeader

The parsed header information of the VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
@dataclass
class VcfFile:
    """
    Bundle of a VCF file's name, its lines and its parsed header.

    Attributes:
        vcf_file_name (str): File name of the VCF.
        vcf_contents (List[str]): Contents of the VCF as returned by ``read_vcf``.
        vcf_header (VcfHeader): Header information parsed from the contents.
    """

    vcf_file_name: str = None
    vcf_contents: List[str] = None
    vcf_header: VcfHeader = None

    @staticmethod
    def populate_fields(template_vcf: Path):
        """
        Build a VcfFile from a template VCF.

        Args:
            template_vcf (Path): Location of the template VCF file.

        Returns:
            VcfFile: Instance holding the file name, the contents read from the file
            and the header parsed from those contents.

        """
        file_name = template_vcf.name
        vcf_lines = read_vcf(template_vcf)
        parsed_header = VcfHeaderParser(vcf_lines).parse_vcf_header()
        return VcfFile(
            vcf_file_name=file_name,
            vcf_contents=vcf_lines,
            vcf_header=parsed_header,
        )

populate_fields(template_vcf) staticmethod

Populate the fields of the VcfFile instance using the contents of a template VCF file.

Parameters:

Name Type Description Default
template_vcf Path

The path to the template VCF file.

required

Returns:

Name Type Description
VcfFile

An instance of VcfFile with populated fields.

Source code in src/pheval/prepare/create_spiked_vcf.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
@staticmethod
def populate_fields(template_vcf: Path):
    """
    Build a VcfFile from a template VCF.

    Args:
        template_vcf (Path): Location of the template VCF file.

    Returns:
        VcfFile: Instance holding the file name, the contents read from the file
        and the header parsed from those contents.

    """
    file_name = template_vcf.name
    vcf_lines = read_vcf(template_vcf)
    parsed_header = VcfHeaderParser(vcf_lines).parse_vcf_header()
    return VcfFile(file_name, vcf_lines, parsed_header)

VcfHeader dataclass

Data obtained from VCF header.

Parameters:

Name Type Description Default
sample_id str

The sample identifier from the VCF header.

required
assembly str

The assembly information obtained from the VCF header.

required
chr_status bool

A boolean indicating whether the VCF denotes chromosomes as chr or not.

required
Source code in src/pheval/prepare/create_spiked_vcf.py
78
79
80
81
82
83
84
85
86
87
88
89
90
@dataclass
class VcfHeader:
    """Information extracted from the header of a VCF file.

    Args:
        sample_id (str): Sample identifier taken from the VCF header.
        assembly (str): Genome assembly determined from the VCF header.
        chr_status (bool): Whether the VCF names its chromosomes with "chr".
    """

    sample_id: str
    assembly: str
    chr_status: bool

VcfHeaderParser

Class for parsing the header of a VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
class VcfHeaderParser:
    """Class for parsing the header of a VCF file."""

    def __init__(self, vcf_contents: list[str]):
        """
        Initialise the VcfHeaderParser.

        Args:
            vcf_contents (list[str]): The contents of the VCF file as a list of strings.
        """
        self.vcf_contents = vcf_contents

    def parse_assembly(self) -> tuple[str, bool]:
        """
        Parse the genome assembly and chromosome naming style from the contig header lines.

        Every line starting with ``##contig=<ID`` contributes a chromosome -> length
        entry. Any "chr" in the contig ID is removed, and only IDs made up purely of
        digits are kept. The resulting mapping is compared against the entries of
        ``genome_assemblies`` to name the assembly.

        Returns:
            Tuple[str, bool]: The assembly name and whether any contig ID contained "chr".

        Raises:
            IndexError: If a contig line has no ``ID=`` or ``length=`` token, or if no
                entry of ``genome_assemblies`` equals the parsed contig lengths.
        """
        vcf_assembly = {}
        chr_status = False
        for line in self.vcf_contents:
            if not line.startswith("##contig=<ID"):
                continue
            tokens = line.split(",")
            chromosome = re.sub(
                r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0]
            )
            if "chr" in chromosome:
                chr_status = True
                chromosome = chromosome.replace("chr", "")
            contig_length = int(
                re.sub(
                    "[^0-9]+",
                    "",
                    [token for token in tokens if "length=" in token][0],
                )
            )
            # Filter at insert time rather than rebuilding the dict for every contig line.
            if chromosome.isdigit():
                vcf_assembly[chromosome] = contig_length
        matching_assemblies = [k for k, v in genome_assemblies.items() if v == vcf_assembly]
        if not matching_assemblies:
            # Same exception type as the bare [0] lookup would raise, with a usable message.
            raise IndexError(
                "Could not determine the genome assembly from the VCF contig header lines."
            )
        return matching_assemblies[0], chr_status

    def parse_sample_id(self) -> str:
        """
        Parse the sample ID of the VCF.

        Returns:
            str: The tenth tab-separated column of the first line starting with
            "#CHROM", with trailing whitespace removed. None is returned when no
            such line exists.
        """
        for line in self.vcf_contents:
            if line.startswith("#CHROM"):
                return line.split("\t")[9].rstrip()

    def parse_vcf_header(self) -> VcfHeader:
        """
        Parse the header of the VCF.

        Returns:
            VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status.
        """
        assembly, chr_status = self.parse_assembly()
        sample_id = self.parse_sample_id()
        return VcfHeader(sample_id, assembly, chr_status)

__init__(vcf_contents)

Initialise the VcfHeaderParser.

Parameters:

Name Type Description Default
vcf_contents list[str]

The contents of the VCF file as a list of strings.

required
Source code in src/pheval/prepare/create_spiked_vcf.py
115
116
117
118
119
120
121
122
def __init__(self, vcf_contents: list[str]):
    """
    Set up the parser with the VCF contents it will read from.

    Args:
        vcf_contents (list[str]): Lines of the VCF file.
    """
    self.vcf_contents = vcf_contents

parse_assembly()

Parse the genome assembly and format of vcf_records.

Returns:

Type Description
tuple[str, bool]

Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False).

Source code in src/pheval/prepare/create_spiked_vcf.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def parse_assembly(self) -> tuple[str, bool]:
    """
    Parse the genome assembly and chromosome naming style from the contig header lines.

    Every line starting with ``##contig=<ID`` contributes a chromosome -> length
    entry. Any "chr" in the contig ID is removed, and only IDs made up purely of
    digits are kept. The resulting mapping is compared against the entries of
    ``genome_assemblies`` to name the assembly.

    Returns:
        Tuple[str, bool]: The assembly name and whether any contig ID contained "chr".

    Raises:
        IndexError: If a contig line has no ``ID=`` or ``length=`` token, or if no
            entry of ``genome_assemblies`` equals the parsed contig lengths.
    """
    vcf_assembly = {}
    chr_status = False
    for line in self.vcf_contents:
        if not line.startswith("##contig=<ID"):
            continue
        tokens = line.split(",")
        chromosome = re.sub(
            r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0]
        )
        if "chr" in chromosome:
            chr_status = True
            chromosome = chromosome.replace("chr", "")
        contig_length = int(
            re.sub(
                "[^0-9]+",
                "",
                [token for token in tokens if "length=" in token][0],
            )
        )
        # Filter at insert time rather than rebuilding the dict for every contig line.
        if chromosome.isdigit():
            vcf_assembly[chromosome] = contig_length
    matching_assemblies = [k for k, v in genome_assemblies.items() if v == vcf_assembly]
    if not matching_assemblies:
        # Same exception type as the bare [0] lookup would raise, with a usable message.
        raise IndexError(
            "Could not determine the genome assembly from the VCF contig header lines."
        )
    return matching_assemblies[0], chr_status

parse_sample_id()

Parse the sample ID of the VCF.

Returns:

Name Type Description
str str

The sample ID extracted from the VCF header.

Source code in src/pheval/prepare/create_spiked_vcf.py
152
153
154
155
156
157
158
159
160
161
def parse_sample_id(self) -> str:
    """
    Extract the sample ID from the column header line of the VCF.

    Returns:
        str: The tenth tab-separated column of the first line starting with
        "#CHROM", with trailing whitespace removed. None is returned when no
        such line exists.
    """
    column_header = next(
        (row for row in self.vcf_contents if row.startswith("#CHROM")), None
    )
    if column_header is None:
        return None
    columns = column_header.split("\t")
    return columns[9].rstrip()

parse_vcf_header()

Parse the header of the VCF.

Returns:

Name Type Description
VcfHeader VcfHeader

An instance of VcfHeader containing sample ID, assembly, and chromosome status.

Source code in src/pheval/prepare/create_spiked_vcf.py
163
164
165
166
167
168
169
170
171
172
def parse_vcf_header(self) -> VcfHeader:
    """
    Collect the parsed assembly, chromosome status and sample ID into a VcfHeader.

    Returns:
        VcfHeader: Header built from the results of ``parse_assembly`` and
        ``parse_sample_id``.
    """
    genome_assembly, has_chr_prefix = self.parse_assembly()
    return VcfHeader(
        sample_id=self.parse_sample_id(),
        assembly=genome_assembly,
        chr_status=has_chr_prefix,
    )

VcfSpiker

Class for spiking proband variants into template VCF file contents.

Source code in src/pheval/prepare/create_spiked_vcf.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
class VcfSpiker:
    """Class for spiking proband variants into template VCF file contents."""

    def __init__(
        self,
        vcf_contents: list[str],
        proband_causative_variants: list[ProbandCausativeVariant],
        vcf_header: VcfHeader,
    ):
        """
        Initialise the VcfSpiker.

        Args:
            vcf_contents (List[str]): Contents of the template VCF file.
            proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants.
            vcf_header (VcfHeader): The VCF header information.
        """
        self.vcf_contents = vcf_contents
        self.proband_causative_variants = proband_causative_variants
        self.vcf_header = vcf_header

    def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> List[str]:
        """
        Construct the ten VCF columns for one proband variant.

        When the template VCF uses "chr" and the variant's chromosome does not contain
        it, "chr" is prepended. This is written back to
        ``proband_variant_data.variant.chrom``, so the input object is modified.

        Args:
            proband_variant_data (ProbandCausativeVariant): Data for the proband variant.

        Returns:
            List[str]: Constructed variant entry as a list of strings; the last element
            ends with a newline.

        Raises:
            KeyError: If the lower-cased genotype is not one of the known genotype names.
        """
        genotype_codes = {
            "hemizygous": "0/1",
            "homozygous": "1/1",
            "heterozygous": "0/1",
            "compound heterozygous": "0/1",
        }
        if self.vcf_header.chr_status is True and "chr" not in proband_variant_data.variant.chrom:
            proband_variant_data.variant.chrom = "chr" + proband_variant_data.variant.chrom
        return [
            proband_variant_data.variant.chrom,
            str(proband_variant_data.variant.pos),
            ".",
            proband_variant_data.variant.ref,
            (
                # A reference of "N" gets its ALT wrapped in angle brackets.
                f"<{proband_variant_data.variant.alt}>"
                if proband_variant_data.variant.ref == "N"
                else proband_variant_data.variant.alt
            ),
            "100",
            "PASS",
            proband_variant_data.info if proband_variant_data.info else ".",
            "GT",
            genotype_codes[proband_variant_data.genotype.lower()] + "\n",
        ]

    def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
        """
        Construct updated VCF records by inserting spiked variants into the template contents.

        Each variant is inserted directly after the last record that has the same
        chromosome and a smaller position. If no such record exists, a warning is
        logged and the variant is appended to the end. ``self.vcf_contents`` itself
        is not modified; a shallow copy is used.

        Args:
            template_vcf_name (str): Name of the template VCF file (used in the warning only).

        Returns:
            List[str]: Updated VCF records containing the spiked variants.
        """
        updated_vcf_records = copy(self.vcf_contents)
        for variant in self.proband_causative_variants:
            variant_entry = self.construct_variant_entry(variant)
            variant_entry_position = None
            for index, record in enumerate(updated_vcf_records):
                # Only CHROM and POS are needed, so split once and stop after two columns.
                fields = record.split("\t", 2)
                if fields[0] == variant_entry[0] and int(fields[1]) < int(variant_entry[1]):
                    variant_entry_position = index + 1
            if variant_entry_position is None:
                info_log.warning(
                    f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-"
                    f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
                    "inserting at end of VCF contents."
                )
                variant_entry_position = len(updated_vcf_records)
            updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
        return updated_vcf_records

    def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
        """
        Replace the template sample ID with the proband ID in the header lines.

        Every line starting with "#" has all occurrences of the template sample ID
        replaced by the proband ID of the first causative variant; other lines are
        passed through unchanged.

        Args:
            updated_vcf_records (List[str]): Updated VCF records.

        Returns:
            List[str]: All lines of the VCF with the header lines rewritten.
        """
        return [
            (
                line.replace(
                    self.vcf_header.sample_id,
                    self.proband_causative_variants[0].proband_id,
                )
                if line.startswith("#")
                else line
            )
            for line in updated_vcf_records
        ]

    def construct_vcf(self, template_vcf_name: str) -> List[str]:
        """
        Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.

        Args:
            template_vcf_name (str): Name of the template VCF file.

        Returns:
            List[str]: The complete spiked VCF file content as a list of strings.
        """
        return self.construct_header(self.construct_vcf_records(template_vcf_name))

__init__(vcf_contents, proband_causative_variants, vcf_header)

Initialise the VcfSpiker.

Parameters:

Name Type Description Default
vcf_contents List[str]

Contents of the template VCF file.

required
proband_causative_variants List[ProbandCausativeVariant]

List of proband causative variants.

required
vcf_header VcfHeader

The VCF header information.

required
Source code in src/pheval/prepare/create_spiked_vcf.py
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
def __init__(
    self,
    vcf_contents: list[str],
    proband_causative_variants: list[ProbandCausativeVariant],
    vcf_header: VcfHeader,
):
    """
    Store the template contents, the variants to spike in and the template header.

    Args:
        vcf_contents (List[str]): Lines of the template VCF file.
        proband_causative_variants (List[ProbandCausativeVariant]): Causative variants of the proband.
        vcf_header (VcfHeader): Header information of the template VCF.
    """
    self.vcf_contents = vcf_contents
    self.proband_causative_variants = proband_causative_variants
    self.vcf_header = vcf_header

construct_header(updated_vcf_records)

Construct the header of the VCF.

Parameters:

Name Type Description Default
updated_vcf_records List[str]

Updated VCF records.

required

Returns:

Type Description
List[str]

List[str]: Constructed header as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
    """
    Replace the template sample ID with the proband ID in the header lines.

    Every line starting with "#" has all occurrences of the template sample ID
    replaced by the proband ID of the first causative variant; other lines are
    passed through unchanged.

    Args:
        updated_vcf_records (List[str]): Lines of the VCF after spiking.

    Returns:
        List[str]: All lines of the VCF with the header lines rewritten.
    """
    return [
        (
            record.replace(
                self.vcf_header.sample_id,
                self.proband_causative_variants[0].proband_id,
            )
            if record.startswith("#")
            else record
        )
        for record in updated_vcf_records
    ]

construct_variant_entry(proband_variant_data)

Construct variant entries.

Parameters:

Name Type Description Default
proband_variant_data ProbandCausativeVariant

Data for the proband variant.

required

Returns:

Type Description
List[str]

List[str]: Constructed variant entry as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> List[str]:
    """
    Build the ten VCF columns for one proband variant.

    When the template VCF uses "chr" and the variant's chromosome does not contain
    it, "chr" is prepended. This is written back to
    ``proband_variant_data.variant.chrom``, so the input object is modified.

    Args:
        proband_variant_data (ProbandCausativeVariant): Data for the proband variant.

    Returns:
        List[str]: The variant entry as a list of column strings; the last element
        ends with a newline.
    """
    genotype_codes = {
        "hemizygous": "0/1",
        "homozygous": "1/1",
        "heterozygous": "0/1",
        "compound heterozygous": "0/1",
    }
    spiked = proband_variant_data.variant
    if self.vcf_header.chr_status is True and "chr" not in spiked.chrom:
        spiked.chrom = "chr" + spiked.chrom
    columns = [spiked.chrom, str(spiked.pos), ".", spiked.ref]
    # A reference of "N" gets its ALT wrapped in angle brackets.
    if spiked.ref == "N":
        columns.append(f"<{spiked.alt}>")
    else:
        columns.append(spiked.alt)
    columns.extend(["100", "PASS"])
    columns.append(proband_variant_data.info or ".")
    columns.append("GT")
    columns.append(genotype_codes[proband_variant_data.genotype.lower()] + "\n")
    return columns

construct_vcf(template_vcf_name)

Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.

Parameters:

Name Type Description Default
template_vcf_name str

Name of the template VCF file.

required

Returns:

Type Description
List[str]

List[str]: The complete spiked VCF file content as a list of strings.

Source code in src/pheval/prepare/create_spiked_vcf.py
393
394
395
396
397
398
399
400
401
402
403
def construct_vcf(self, template_vcf_name: str) -> List[str]:
    """
    Produce the full spiked VCF: spike the records, then rewrite the header lines.

    Args:
        template_vcf_name (str): Name of the template VCF file.

    Returns:
        List[str]: Lines of the complete spiked VCF.
    """
    spiked_records = self.construct_vcf_records(template_vcf_name)
    return self.construct_header(spiked_records)

construct_vcf_records(template_vcf_name)

Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.

Parameters:

Name Type Description Default
template_vcf_name str

Name of the template VCF file.

required

Returns:

Type Description
List[str]

List[str]: Updated VCF records containing the spiked variants.

Source code in src/pheval/prepare/create_spiked_vcf.py
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
    """
    Construct updated VCF records by inserting spiked variants into the template contents.

    Each variant is inserted directly after the last record that has the same
    chromosome and a smaller position. If no such record exists, a warning is
    logged and the variant is appended to the end. ``self.vcf_contents`` itself
    is not modified; a shallow copy is used.

    Args:
        template_vcf_name (str): Name of the template VCF file (used in the warning only).

    Returns:
        List[str]: Updated VCF records containing the spiked variants.
    """
    updated_vcf_records = copy(self.vcf_contents)
    for variant in self.proband_causative_variants:
        variant_entry = self.construct_variant_entry(variant)
        variant_entry_position = None
        for index, record in enumerate(updated_vcf_records):
            # Only CHROM and POS are needed, so split once and stop after two columns.
            fields = record.split("\t", 2)
            if fields[0] == variant_entry[0] and int(fields[1]) < int(variant_entry[1]):
                variant_entry_position = index + 1
        if variant_entry_position is None:
            info_log.warning(
                f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-"
                f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
                "inserting at end of VCF contents."
            )
            variant_entry_position = len(updated_vcf_records)
        updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
    return updated_vcf_records

VcfWriter

Class for writing VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
class VcfWriter:
    """Class for writing VCF file."""

    def __init__(
        self,
        vcf_contents: List[str],
        spiked_vcf_file_path: Path,
    ):
        """
        Initialise the VcfWriter class.

        Args:
            vcf_contents (List[str]): Contents of the VCF file to be written.
            spiked_vcf_file_path (Path): Path to the spiked VCF file to be created.
        """
        self.vcf_contents = vcf_contents
        self.spiked_vcf_file_path = spiked_vcf_file_path

    def write_gzip(self) -> None:
        """
        Write the VCF contents to a gzipped VCF file.

        Lines are encoded (UTF-8, the ``str.encode`` default) before the file is
        opened, so an encoding failure does not leave a truncated file behind.
        """
        encoded_contents = [line.encode() for line in self.vcf_contents]
        # The context manager closes the file; no explicit close() is needed.
        with gzip.open(self.spiked_vcf_file_path, "wb") as f:
            f.writelines(encoded_contents)

    def write_uncompressed(self) -> None:
        """
        Write the VCF contents to an uncompressed VCF file.

        The file is written as UTF-8 so the bytes match what ``write_gzip`` produces,
        independent of the platform's default encoding.
        """
        # The context manager closes the file; no explicit close() is needed.
        with open(self.spiked_vcf_file_path, "w", encoding="utf-8") as file:
            file.writelines(self.vcf_contents)

    def write_vcf_file(self) -> None:
        """
        Write the VCF file based on compression type.

        Uses ``is_gzipped`` on the output path to choose between the gzip writer and
        the uncompressed writer.
        """
        if is_gzipped(self.spiked_vcf_file_path):
            self.write_gzip()
        else:
            self.write_uncompressed()

__init__(vcf_contents, spiked_vcf_file_path)

Initialise the VcfWriter class.

Parameters:

Name Type Description Default
vcf_contents List[str]

Contents of the VCF file to be written.

required
spiked_vcf_file_path Path

Path to the spiked VCF file to be created.

required
Source code in src/pheval/prepare/create_spiked_vcf.py
409
410
411
412
413
414
415
416
417
418
419
420
421
422
def __init__(
    self,
    vcf_contents: List[str],
    spiked_vcf_file_path: Path,
):
    """
    Store the lines to write and the destination path.

    Args:
        vcf_contents (List[str]): Lines of the VCF file to be written.
        spiked_vcf_file_path (Path): Destination of the spiked VCF file.
    """
    self.vcf_contents = vcf_contents
    self.spiked_vcf_file_path = spiked_vcf_file_path

write_gzip()

Write the VCF contents to a gzipped VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
424
425
426
427
428
429
430
431
432
def write_gzip(self) -> None:
    """
    Write the VCF contents to a gzipped VCF file.

    Lines are encoded (UTF-8, the ``str.encode`` default) before the file is
    opened, so an encoding failure does not leave a truncated file behind.
    """
    encoded_contents = [line.encode() for line in self.vcf_contents]
    # The context manager closes the file; no explicit close() is needed.
    with gzip.open(self.spiked_vcf_file_path, "wb") as f:
        f.writelines(encoded_contents)

write_uncompressed()

Write the VCF contents to an uncompressed VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
434
435
436
437
438
439
440
def write_uncompressed(self) -> None:
    """
    Write the VCF contents to an uncompressed VCF file.

    The file is written as UTF-8, independent of the platform's default encoding.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(self.spiked_vcf_file_path, "w", encoding="utf-8") as file:
        file.writelines(self.vcf_contents)

write_vcf_file()

Write the VCF file based on compression type.

Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed).

Source code in src/pheval/prepare/create_spiked_vcf.py
442
443
444
445
446
447
448
449
def write_vcf_file(self) -> None:
    """
    Write the VCF contents using the writer that matches the output path.

    ``is_gzipped`` is asked about the spiked VCF file path; a true result selects
    the gzip writer, otherwise the uncompressed writer is used.
    """
    if is_gzipped(self.spiked_vcf_file_path):
        self.write_gzip()
    else:
        self.write_uncompressed()

check_variant_assembly(proband_causative_variants, vcf_header, phenopacket_path)

Check that the genome assembly of the proband's causative variants matches the assembly of the VCF.

Parameters:

Name Type Description Default
proband_causative_variants List[ProbandCausativeVariant]

A list of causative variants from the proband.

required
vcf_header VcfHeader

An instance of VcfHeader representing the VCF file's header.

required
phenopacket_path Path

The path to the Phenopacket file.

required

Raises:

Type Description
ValueError

If there are too many or incompatible genome assemblies found.

IncompatibleGenomeAssemblyError

If the assembly in the Phenopacket does not match the VCF assembly.

Source code in src/pheval/prepare/create_spiked_vcf.py
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def check_variant_assembly(
    proband_causative_variants: list[ProbandCausativeVariant],
    vcf_header: VcfHeader,
    phenopacket_path: Path,
) -> None:
    """
    Check that the genome assembly of the causative variants matches the assembly of the VCF.

    "hg19" and "GRCh37" are treated as the same assembly, as are "hg38" and "GRCh38".

    Args:
        proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
        vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header.
        phenopacket_path (Path): The path to the Phenopacket file.

    Raises:
        ValueError: If the variants carry no genome assembly (empty list) or more than one.
        IncompatibleGenomeAssemblyError: If the variants' assembly is not one of the supported
            names, or if it does not match the VCF assembly.
    """
    hg19_names = {"hg19", "GRCh37"}
    hg38_names = {"hg38", "GRCh38"}
    phenopacket_assembly = list({variant.assembly for variant in proband_causative_variants})
    if not phenopacket_assembly:
        # Fail with a clear message instead of an IndexError on phenopacket_assembly[0].
        raise ValueError(
            f"No genome assembly found for the causative variants in {phenopacket_path}!"
        )
    if len(phenopacket_assembly) > 1:
        raise ValueError("Too many genome assemblies!")
    if phenopacket_assembly[0] in hg19_names:
        compatible_vcf_assemblies = hg19_names
    elif phenopacket_assembly[0] in hg38_names:
        compatible_vcf_assemblies = hg38_names
    else:
        raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
    if vcf_header.assembly not in compatible_vcf_assemblies:
        raise IncompatibleGenomeAssemblyError(
            assembly=phenopacket_assembly, phenopacket=phenopacket_path
        )

create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir, hg38_vcf_dir)

Create a spiked VCF for a Phenopacket.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the generated spiked VCF file.

required
phenopacket_path Path

Path to the Phenopacket file.

required
hg19_template_vcf Path

Path to the hg19 template VCF file (optional).

required
hg38_template_vcf Path

Path to the hg38 template VCF file (optional).

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files (optional).

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files (optional).

required

Raises:

Type Description
InputError

If both hg19_template_vcf and hg38_template_vcf are None.

Source code in src/pheval/prepare/create_spiked_vcf.py
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
def create_spiked_vcf(
    output_dir: Path,
    phenopacket_path: Path,
    hg19_template_vcf: Path,
    hg38_template_vcf: Path,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> None:
    """
    Create a spiked VCF for a single Phenopacket.

    Each template VCF that is given is loaded into a VcfFile, and the work is then
    handed to ``spike_and_update_phenopacket``.

    Args:
        output_dir (Path): The directory to store the generated spiked VCF file.
        phenopacket_path (Path): Path to the Phenopacket file.
        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).

    Raises:
        InputError: If both hg19_template_vcf and hg38_template_vcf are None.
    """
    no_template_given = hg19_template_vcf is None and hg38_template_vcf is None
    if no_template_given:
        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
    hg19_vcf_info = None
    if hg19_template_vcf:
        hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf)
    hg38_vcf_info = None
    if hg38_template_vcf:
        hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf)
    spike_and_update_phenopacket(
        hg19_vcf_info,
        hg38_vcf_info,
        hg19_vcf_dir,
        hg38_vcf_dir,
        output_dir,
        phenopacket_path,
    )

create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir, hg38_vcf_dir)

Create a spiked VCF for a directory of Phenopackets.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the generated spiked VCF file.

required
phenopacket_dir Path

Path to the Phenopacket directory.

required
hg19_template_vcf Path

Path to the template hg19 VCF file (optional).

required
hg38_template_vcf Path

Path to the template hg38 VCF file (optional).

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files (optional).

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files (optional).

required

Raises:

Type Description
InputError

If hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir and hg38_vcf_dir are all None.

Source code in src/pheval/prepare/create_spiked_vcf.py
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
def create_spiked_vcfs(
    output_dir: Path,
    phenopacket_dir: Path,
    hg19_template_vcf: Path,
    hg38_template_vcf: Path,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> None:
    """
    Spike a VCF for each ``.json`` Phenopacket file in a directory.

    Any template VCF that is supplied is read once up front and then reused for
    every Phenopacket processed.

    Args:
        output_dir (Path): The directory to store the generated spiked VCF files.
        phenopacket_dir (Path): Path to the Phenopacket directory.
        hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
        hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).

    Raises:
        InputError: If hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir and
            hg38_vcf_dir are all None.
    """
    vcf_sources = (hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir, hg38_vcf_dir)
    if all(source is None for source in vcf_sources):
        raise InputError("Need to specify a VCF!")

    # Parse each provided template a single time rather than once per Phenopacket.
    hg19_vcf_info = None
    if hg19_template_vcf:
        hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf)
    hg38_vcf_info = None
    if hg38_template_vcf:
        hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf)

    for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
        spike_and_update_phenopacket(
            hg19_vcf_info,
            hg38_vcf_info,
            hg19_vcf_dir,
            hg38_vcf_dir,
            output_dir,
            phenopacket_path,
        )

generate_spiked_vcf_file(output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir)

Write spiked VCF contents to a new file.

Parameters:

Name Type Description Default
output_dir Path

Path to the directory to store the generated file.

required
phenopacket Union[Phenopacket, Family]

Phenopacket or Family containing causative variants.

required
phenopacket_path Path

Path to the Phenopacket file.

required
hg19_vcf_info VcfFile

VCF file info for hg19 template vcf.

required
hg38_vcf_info VcfFile

VCF file info for hg38 template vcf.

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files.

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files.

required

Returns:

Name Type Description
File File

The generated File object representing the newly created spiked VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
def generate_spiked_vcf_file(
    output_dir: Path,
    phenopacket: Union[Phenopacket, Family],
    phenopacket_path: Path,
    hg19_vcf_info: VcfFile,
    hg38_vcf_info: VcfFile,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> File:
    """
    Write spiked VCF contents to a new file.

    The output file is placed in ``output_dir`` and named after the Phenopacket
    file, with ``.json`` replaced by ``.vcf.gz``.

    Args:
        output_dir (Path): Path to the directory to store the generated file. It is
            created, together with any missing parent directories, if it does not exist.
        phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
        phenopacket_path (Path): Path to the Phenopacket file.
        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
    Returns:
        File: The generated File object representing the newly created spiked VCF file.
    """
    # parents=True: a nested output path whose parents do not exist yet would
    # otherwise raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)
    info_log.info(f" Created a directory {output_dir}")
    vcf_assembly, spiked_vcf = spike_vcf_contents(
        phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir
    )
    spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
    VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
    # Path.as_uri() raises ValueError for relative paths, so anchor the path to the
    # current working directory first. Absolute paths are left unchanged.
    spiked_vcf_uri = urllib.parse.unquote(spiked_vcf_path.absolute().as_uri())
    return File(
        uri=spiked_vcf_uri,
        file_attributes={"fileFormat": "vcf", "genomeAssembly": vcf_assembly},
    )

read_vcf(vcf_file)

Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.

Parameters:

Name Type Description Default
vcf_file Path

The path to the VCF file to be read.

required

Returns:

Type Description
List[str]

List[str]: A list containing the lines of the VCF file.

Source code in src/pheval/prepare/create_spiked_vcf.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def read_vcf(vcf_file: Path) -> List[str]:
    """
    Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.

    Gzipped files are read as bytes and each line is decoded with ``bytes.decode()``
    (UTF-8 by default). Uncompressed files are read in text mode.

    Args:
        vcf_file (Path): The path to the VCF file to be read.

    Returns:
        List[str]: A list containing the lines of the VCF file.
    """
    # Context managers guarantee the handle is closed even if reading or decoding fails.
    if is_gzipped(vcf_file):
        with gzip.open(vcf_file) as vcf:
            return [line.decode() for line in vcf.readlines()]
    with open(vcf_file) as vcf:
        return vcf.readlines()

select_vcf_template(phenopacket_path, proband_causative_variants, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir)

Select the appropriate VCF template based on the assembly information of the proband causative variants.

Parameters:

Name Type Description Default
phenopacket_path Path

The path to the Phenopacket file.

required
proband_causative_variants List[ProbandCausativeVariant]

A list of causative variants from the proband.

required
hg19_vcf_info VcfFile

VCF file info for hg19 template vcf.

required
hg38_vcf_info VcfFile

VCF file info for hg38 template vcf.

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files.

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files.

required

Returns:

Name Type Description
VcfFile VcfFile

The selected VCF template file based on the assembly information of the proband causative variants.

Source code in src/pheval/prepare/create_spiked_vcf.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def select_vcf_template(
    phenopacket_path: Path,
    proband_causative_variants: List[ProbandCausativeVariant],
    hg19_vcf_info: VcfFile,
    hg38_vcf_info: VcfFile,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> VcfFile:
    """
    Pick the VCF template matching the genome assembly of the first proband causative variant.

    For the matching assembly, the pre-loaded template is preferred. If none was given,
    a file is chosen at random from the corresponding VCF directory and loaded.

    Args:
        phenopacket_path (Path): The path to the Phenopacket file.
        proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.

    Returns:
        VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.

    Raises:
        InputError: If neither a template nor a VCF directory is available for the assembly.
        IncompatibleGenomeAssemblyError: If the assembly is not one of hg19, GRCh37, hg38 or GRCh38.
    """
    # Only the first variant's assembly is consulted when choosing the template.
    assembly = proband_causative_variants[0].assembly
    if assembly in ["hg19", "GRCh37"]:
        template_info, template_dir = hg19_vcf_info, hg19_vcf_dir
        missing_template_message = "Must specify hg19 template VCF!"
    elif assembly in ["hg38", "GRCh38"]:
        template_info, template_dir = hg38_vcf_info, hg38_vcf_dir
        missing_template_message = "Must specify hg38 template VCF!"
    else:
        raise IncompatibleGenomeAssemblyError(assembly, phenopacket_path)

    if template_info:
        return template_info
    if template_dir:
        return VcfFile.populate_fields(random.choice(all_files(template_dir)))
    raise InputError(missing_template_message)

spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path)

Spike a template VCF with the causative variants from the provided Phenopacket, write the spiked VCF to the specified output directory, and write the Phenopacket, updated with the spiked VCF's path, back to the original Phenopacket path.

Parameters:

Name Type Description Default
hg19_vcf_info VcfFile

VCF file info for hg19 template vcf.

required
hg38_vcf_info VcfFile

VCF file info for hg38 template vcf.

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files.

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files.

required
output_dir Path

Directory where the spiked VCF file will be saved.

required
phenopacket_path Path

Path to the original Phenopacket file.

required

Returns:

Type Description
None

None

Source code in src/pheval/prepare/create_spiked_vcf.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
def spike_and_update_phenopacket(
    hg19_vcf_info: VcfFile,
    hg38_vcf_info: VcfFile,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
    output_dir: Path,
    phenopacket_path: Path,
) -> None:
    """
    Generate a spiked VCF for a Phenopacket and record its location in the Phenopacket.

    The Phenopacket is read from ``phenopacket_path``, a spiked VCF is generated into
    ``output_dir``, and the Phenopacket, rebuilt to reference the spiked VCF, is
    written back to ``phenopacket_path``.

    Args:
        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
        output_dir (Path): Directory where the spiked VCF file will be saved.
        phenopacket_path (Path): Path to the Phenopacket file; also the destination of
            the updated Phenopacket.

    Returns:
        None
    """
    source_phenopacket = phenopacket_reader(phenopacket_path)
    vcf_file_message = generate_spiked_vcf_file(
        output_dir,
        source_phenopacket,
        phenopacket_path,
        hg19_vcf_info,
        hg38_vcf_info,
        hg19_vcf_dir,
        hg38_vcf_dir,
    )
    rebuilder = PhenopacketRebuilder(source_phenopacket)
    # The updated Phenopacket is written to the same path it was read from.
    write_phenopacket(rebuilder.add_spiked_vcf_path(vcf_file_message), phenopacket_path)

spike_vcf_contents(phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir)

Spike VCF records with variants obtained from a Phenopacket or Family.

Parameters:

Name Type Description Default
phenopacket Union[Phenopacket, Family]

Phenopacket or Family containing causative variants.

required
phenopacket_path Path

Path to the Phenopacket file.

required
hg19_vcf_info VcfFile

VCF file info for hg19 template vcf.

required
hg38_vcf_info VcfFile

VCF file info for hg38 template vcf.

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files.

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files.

required

Returns:

Type Description
tuple[str, List[str]]

A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants.

Source code in src/pheval/prepare/create_spiked_vcf.py
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
def spike_vcf_contents(
    phenopacket: Union[Phenopacket, Family],
    phenopacket_path: Path,
    hg19_vcf_info: VcfFile,
    hg38_vcf_info: VcfFile,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> tuple[str, List[str]]:
    """
    Build spiked VCF records from the causative variants of a Phenopacket or Family.

    A template VCF is selected for the variants, the variants are checked against the
    template's header, and the template contents are then spiked with the variants.

    Args:
        phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
        phenopacket_path (Path): Path to the Phenopacket file.
        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.

    Returns:
        A tuple containing:
            assembly (str): The genome assembly information extracted from VCF header.
            modified_vcf_contents (List[str]): Modified VCF records with spiked variants.
    """
    causative_variants = PhenopacketUtil(phenopacket).causative_variants()
    template = select_vcf_template(
        phenopacket_path,
        causative_variants,
        hg19_vcf_info,
        hg38_vcf_info,
        hg19_vcf_dir,
        hg38_vcf_dir,
    )
    header = template.vcf_header
    check_variant_assembly(causative_variants, header, phenopacket_path)

    # Read the assembly before spiking, in the same order as the result tuple is built.
    assembly = header.assembly
    spiker = VcfSpiker(template.vcf_contents, causative_variants, header)
    spiked_contents = spiker.construct_vcf(template.vcf_file_name)
    return assembly, spiked_contents

spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir, hg38_vcf_dir)

Create spiked VCF from either a Phenopacket or a Phenopacket directory.

Parameters:

Name Type Description Default
output_dir Path

The directory to store the generated spiked VCF file(s).

required
phenopacket_path Path

Path to a single Phenopacket file (optional).

required
phenopacket_dir Path

Path to a directory containing Phenopacket files (optional).

required
hg19_template_vcf Path

Path to the hg19 template VCF file (optional).

required
hg38_template_vcf Path

Path to the hg38 template VCF file (optional).

required
hg19_vcf_dir Path

The directory containing the hg19 VCF files (optional).

required
hg38_vcf_dir Path

The directory containing the hg38 VCF files (optional).

required
Source code in src/pheval/prepare/create_spiked_vcf.py
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
def spike_vcfs(
    output_dir: Path,
    phenopacket_path: Path,
    phenopacket_dir: Path,
    hg19_template_vcf: Path,
    hg38_template_vcf: Path,
    hg19_vcf_dir: Path,
    hg38_vcf_dir: Path,
) -> None:
    """
    Dispatch spiked VCF creation for a single Phenopacket or a directory of Phenopackets.

    ``phenopacket_path`` takes precedence when both it and ``phenopacket_dir`` are given.
    If both are None, nothing is done.

    Args:
        output_dir (Path): The directory to store the generated spiked VCF file(s).
        phenopacket_path (Path): Path to a single Phenopacket file (optional).
        phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
    """
    # Both code paths take the same trailing VCF arguments, in the same order.
    vcf_args = (hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir, hg38_vcf_dir)
    if phenopacket_path is not None:
        create_spiked_vcf(output_dir, phenopacket_path, *vcf_args)
        return
    if phenopacket_dir is not None:
        create_spiked_vcfs(output_dir, phenopacket_dir, *vcf_args)