Source code for dae.tools.dae_liftover

import argparse
import logging
import sys
import textwrap
from collections import Counter, defaultdict
from typing import Any, cast

from dae.annotation.annotatable import VCFAllele
from dae.annotation.annotation_factory import load_pipeline_from_yaml
from dae.genomic_resources.reference_genome import (
    build_reference_genome_from_resource,
)
from dae.gpf_instance.gpf_instance import GPFInstance
from dae.pedigrees.loader import FamiliesLoader
from dae.tools.stats_liftover import save_liftover_stats
from dae.utils.regions import Region
from dae.utils.variant_utils import mat2str
from dae.utils.verbosity_configuration import VerbosityConfiguration
from dae.variants.family_variant import FamilyAllele
from dae.variants.variant import VariantDetails
from dae.variants_loaders.dae.loader import DaeTransmittedLoader

logger = logging.getLogger("dae_liftover")


[docs] def parse_cli_arguments() -> argparse.ArgumentParser: """Create CLI parser.""" parser = argparse.ArgumentParser( description="liftover denovo variants to hg38") VerbosityConfiguration.set_arguments(parser) FamiliesLoader.cli_arguments(parser) DaeTransmittedLoader.cli_arguments(parser) parser.add_argument( "-c", "--chain", help="chain resource id", default="liftover/hg19ToHg38") parser.add_argument( "-t", "--target-genome", help="target genome", default="hg38/genomes/GRCh38-hg38") parser.add_argument( "--target-gene-models", "--tgm", help="target gene models", default="hg38/gene_models/refGene_v20170601") parser.add_argument( "-s", "--source-genome", help="source genome", default="hg19/genomes/GATK_ResourceBundle_5777_b37_phiX174") parser.add_argument( "--source-gene-models", "--sgm", help="source gene models", default="hg19/gene_models/refGene_v20190211") parser.add_argument( "--stats", help="filename to store liftover statistics", default="stats.txt", ) parser.add_argument( "-o", "--output-prefix", help="output filename prefix", default="transmitted") parser.add_argument( "--region", type=str, dest="region", metavar="region", default=None, help="region to convert [default: None] " "ex. chr1:1-10000. ", ) return parser
[docs] def main( argv: list[str] | None = None, gpf_instance: GPFInstance | None = None) -> None: """Liftover dae variants tool main function.""" # pylint: disable=too-many-locals,too-many-statements if argv is None: argv = sys.argv[1:] if gpf_instance is None: gpf_instance = GPFInstance.build() parser = parse_cli_arguments() assert argv is not None args = parser.parse_args(argv) VerbosityConfiguration.set(args) grr = gpf_instance.grr source_genome = build_reference_genome_from_resource( grr.get_resource(args.source_genome)) assert source_genome is not None source_genome.open() families_filenames, families_params = \ FamiliesLoader.parse_cli_arguments(args) families_filename = families_filenames[0] families_loader = FamiliesLoader( families_filename, **families_params, ) families = families_loader.load() variants_filenames, variants_params = \ DaeTransmittedLoader.parse_cli_arguments(args) variants_loader = DaeTransmittedLoader( families, variants_filenames, # type: ignore params=variants_params, genome=source_genome, ) summary_filename = f"{args.output_prefix}.txt" toomany_filename = f"{args.output_prefix}-TOOMANY.txt" if args.region is not None: region = Region.from_str(args.region) logger.info("resetting regions (region): %s", region) variants_loader.reset_regions([region]) summary_filename = f"{args.output_prefix}-{region}.txt" toomany_filename = f"{args.output_prefix}-TOOMANY-{region}.txt" pipeline_config = textwrap.dedent( f""" - effect_annotator: gene_models: {args.source_gene_models} genome: {args.source_genome} attributes: - source: "worst_effect" name: "source_worst_effect" - source: "gene_effects" name: "source_gene_effects" - source: "effect_details" name: "source_effect_details" - liftover_annotator: chain: {args.chain} source_genome: {args.source_genome} target_genome: {args.target_genome} attributes: - source: liftover_annotatable name: target_annotatable - effect_annotator: gene_models: {args.target_gene_models} genome: {args.target_genome} input_annotatable: target_annotatable attributes: - source: "worst_effect" name: "target_worst_effect" - source: "gene_effects" name: "target_gene_effects" - source: "effect_details" name: "target_effect_details" """, ) pipeline = load_pipeline_from_yaml(pipeline_config, gpf_instance.grr) pipeline.open() target_stats: dict[str, Any] = defaultdict(Counter) with open(summary_filename, "wt") as output_summary, \ open(toomany_filename, "wt") as output_toomany: summary_header = [ "#chr", "position", "variant", "familyData", "all.nParCalled", "all.prcntParCalled", "all.nAltAlls", "all.altFreq", ] toomany_header = [ "#chr", "position", "variant", "familyData", ] output_summary.write("\t".join(summary_header)) output_summary.write("\n") output_toomany.write("\t".join(toomany_header)) output_toomany.write("\n") for sv, fvs in variants_loader.full_variants_iterator(): assert len(sv.alt_alleles) == 1 aa = sv.alt_alleles[0] annotatable: VCFAllele = cast(VCFAllele, aa.get_annotatable()) result = pipeline.annotate(annotatable) liftover_annotatable: VCFAllele = \ cast(VCFAllele, result.get("target_annotatable")) source_worst_effect = cast(str, result.get("source_worst_effect")) target_worst_effect = cast(str, result.get("target_worst_effect")) target_stats[source_worst_effect]["source"] += 1 if liftover_annotatable is None: logger.error("can't liftover %s", aa) target_stats[source_worst_effect]["no_liftover"] += 1 continue liftover_cshl_variant = VariantDetails.from_vcf( liftover_annotatable.chrom, liftover_annotatable.pos, liftover_annotatable.ref, liftover_annotatable.alt) target_stats[source_worst_effect][target_worst_effect] += 1 summary_line = [ liftover_cshl_variant.chrom, str(liftover_cshl_variant.cshl_position), liftover_cshl_variant.cshl_variant, ] frequency_data = [ str(aa.attributes.get("af_parents_called_count", "")), str(aa.attributes.get("af_parents_called_percent", "")), str(aa.attributes.get("af_allele_count", "")), str(aa.attributes.get("af_allele_freq", "")), ] toomany_line = [ liftover_cshl_variant.chrom, str(liftover_cshl_variant.cshl_position), liftover_cshl_variant.cshl_variant, ] families_data = [] for fv in fvs: fa = cast(FamilyAllele, fv.alt_alleles[0]) fdata = [ fa.family_id, mat2str(fa.best_state), mat2str( fa.family_attributes["read_counts"], col_sep=" "), ] families_data.append(":".join(fdata)) if len(families_data) < 20: summary_line.append(";".join(families_data)) summary_line.extend(frequency_data) output_summary.write("\t".join(summary_line)) output_summary.write("\n") else: summary_line.append("TOOMANY") summary_line.extend(frequency_data) output_summary.write("\t".join(summary_line)) output_summary.write("\n") toomany_line.append(";".join(families_data)) output_toomany.write("\t".join(toomany_line)) output_toomany.write("\n") save_liftover_stats(target_stats, args.stats)