import argparse
import logging
import sys
import tempfile
from contextlib import closing
import pysam
from dae.annotation.context import CLIAnnotationContext
from dae.annotation.liftover_annotator import (
basic_liftover_variant,
bcf_liftover_variant,
)
from dae.genomic_resources.genomic_context import get_genomic_context
from dae.genomic_resources.liftover_chain import (
LiftoverChain,
build_liftover_chain_from_resource,
)
from dae.genomic_resources.reference_genome import (
ReferenceGenome,
build_reference_genome_from_resource,
)
from dae.genomic_resources.repository import GenomicResourceRepo
from dae.utils.regions import Region
from dae.utils.verbosity_configuration import VerbosityConfiguration
logger = logging.getLogger("vcf_liftover")
[docs]
def parse_cli_arguments() -> argparse.ArgumentParser:
"""Create CLI parser."""
parser = argparse.ArgumentParser(
description="liftover VCF variants")
CLIAnnotationContext.add_context_arguments(parser)
VerbosityConfiguration.set_arguments(parser)
parser.add_argument(
"vcffile", help="vcf file to liftover",
)
parser.add_argument(
"-c", "--chain", help="chain resource id",
default="liftover/hg19ToHg38")
parser.add_argument(
"-t", "--target-genome", help="target genome",
default="hg38/genomes/GRCh38-hg38")
parser.add_argument(
"-s", "--source-genome", help="source genome",
default="hg19/genomes/GATK_ResourceBundle_5777_b37_phiX174")
parser.add_argument(
"-o", "--output-prefix", help="output filename prefix",
default="transmitted")
parser.add_argument(
"--region",
type=str,
dest="region",
metavar="region",
default=None,
help="region to convert [default: None] "
"ex. chr1:1-10000. ",
)
parser.add_argument(
"--mode",
type=str,
dest="mode",
metavar="mode",
default="bcf_liftover",
help="mode to use for liftover: 'bcf_liftover' or 'basic_liftover'",
)
return parser
def _construct_reference_genome(
grr: GenomicResourceRepo,
resource_id: str,
) -> ReferenceGenome:
res = grr.get_resource(resource_id)
if res is None:
raise ValueError(f"reference genome {resource_id} not found")
genome = build_reference_genome_from_resource(res)
if genome is None:
raise ValueError(f"reference genome {resource_id} not found")
return genome.open()
def _construct_liftover_chain(
grr: GenomicResourceRepo,
resource_id: str,
) -> LiftoverChain:
res = grr.get_resource(resource_id)
if res is None:
raise ValueError(f"liftover chain {resource_id} not found")
chain = build_liftover_chain_from_resource(res)
if chain is None:
raise ValueError(f"liftover chain {resource_id} not found")
return chain.open()
[docs]
def main(
argv: list[str] | None = None,
grr: GenomicResourceRepo | None = None,
) -> None:
"""Liftover dae variants tool main function."""
# pylint: disable=too-many-locals,too-many-statements
if argv is None:
argv = sys.argv[1:]
parser = parse_cli_arguments()
args = parser.parse_args(argv)
VerbosityConfiguration.set(args)
if grr is None:
context = get_genomic_context()
grr = context.get_genomic_resources_repository()
if grr is None:
raise ValueError("genomic resources repository not found")
source_genome = _construct_reference_genome(grr, args.source_genome)
target_genome = _construct_reference_genome(grr, args.target_genome)
chain = _construct_liftover_chain(grr, args.chain)
output_filename = f"{args.output_prefix}.vcf"
region = None
if args.region is not None:
region = str(Region.from_str(args.region))
output_filename = f"{args.output_prefix}-{region}.vcf"
with closing(pysam.VariantFile(args.vcffile)) as infile:
output_header = liftover_header(
args.vcffile,
source_genome,
target_genome)
if args.mode == "bcf_liftover":
_liftover_variant_func = bcf_liftover_variant
elif args.mode == "basic_liftover":
_liftover_variant_func = basic_liftover_variant
else:
raise ValueError(f"Invalid mode: {args.mode}")
with closing(pysam.VariantFile(args.vcffile)) as infile, \
closing(pysam.VariantFile(
output_filename, "w", header=output_header)) as outfile:
for vcf_variant in infile.fetch(region=region):
if vcf_variant.alts is None:
logger.warning("skipping variant without alts: %s", vcf_variant)
continue
if vcf_variant.ref is None:
logger.warning("skipping variant without ref: %s", vcf_variant)
continue
lo_variant = _liftover_variant_func(
vcf_variant.chrom, vcf_variant.pos,
vcf_variant.ref, list(vcf_variant.alts),
chain, source_genome, target_genome,
)
if lo_variant is None:
logger.warning(
"skipping variant without liftover: %s",
report_vcf_variant(vcf_variant))
continue
vcf_variant.translate(output_header)
chrom, pos, ref, alts = lo_variant
try:
vcf_variant.contig = chrom
vcf_variant.pos = pos
vcf_variant.ref = ref
vcf_variant.alts = tuple(alts)
outfile.write(vcf_variant)
except ValueError:
logger.exception(
"skipping variant with invalid liftover: %s liftover to %s",
report_vcf_variant(vcf_variant), report_variant(lo_variant))
raise
[docs]
def report_vcf_variant(vcf_variant: pysam.VariantRecord) -> str:
"""Report VCF variant."""
return (
f"({vcf_variant.chrom}:{vcf_variant.pos} "
f"{vcf_variant.ref} > "
f"{','.join(vcf_variant.alts) if vcf_variant.alts else '.'})"
)
[docs]
def report_alleles(alleles: list[tuple[str, int, str, str]]) -> str:
"""Report alleles."""
if not alleles:
return "(none)"
return ";".join([
f"({chrom}:{pos} {ref} > {alt})"
for chrom, pos, ref, alt in alleles
])
[docs]
def report_variant(variant: tuple[str, int, str, list[str]] | None) -> str:
"""Report variant."""
if not variant:
return "(none)"
chrom, pos, ref, alts = variant
s_alts = ",".join(alts)
return f"({chrom}:{pos} {ref} > {s_alts})"