Source code for dae.enrichment_tool.build_coding_length_enrichment_background

import argparse
import logging
import sys
from collections import defaultdict
from typing import cast

import pandas as pd

from dae.genomic_resources.gene_models import (
    GeneModels,
    build_gene_models_from_resource,
)
from dae.genomic_resources.repository import GenomicResourceRepo
from dae.genomic_resources.repository_factory import (
    build_genomic_resource_repository,
)
from dae.utils.regions import BedRegion, collapse
from dae.utils.verbosity_configuration import VerbosityConfiguration

logger = logging.getLogger(__name__)


[docs] def cli(argv: list[str] | None = None) -> None: """Command line tool to create coding length enrichment background.""" if argv is None: argv = sys.argv[1:] description = ( "Command line tool to create coding length enrichment " "background" ) parser = argparse.ArgumentParser(description=description) VerbosityConfiguration.set_arguments(parser) parser.add_argument( "--grr", "--definition", "-g", type=str, default=None, help="Path to an extra GRR definition file. This GRR will be loaded" "in a group alongside the local one.") parser.add_argument( dest="gene_models", type=str, help="Specifies the resource ID of gene models to use") parser.add_argument( "-o", "--output", type=str, default="coding_len_background.tsv", help="Output filename to store the create coding length background") args = parser.parse_args(argv) VerbosityConfiguration.set(args) grr: GenomicResourceRepo = \ build_genomic_resource_repository(file_name=args.grr) resource = grr.get_resource(args.gene_models) gene_models = build_gene_models_from_resource(resource) gene_models.load() background_df = build_coding_length_background(gene_models) background_df.to_csv(args.output, index=False, sep="\t")
[docs] def build_coding_length_background(gene_models: GeneModels) -> pd.DataFrame: """Build coding length enrichment background data.""" genes_coding_regions = defaultdict(list) for gm in gene_models.transcript_models.values(): if gm.chrom.endswith("alt") or gm.chrom.endswith("fix"): continue genes_coding_regions[gm.gene].extend(gm.cds_regions()) genes_regions = {} for gene, regions in genes_coding_regions.items(): genes_regions[gene] = collapse(regions) genes_lengths = {} for gene, regions in genes_regions.items(): gene_len = sum(len(cast(BedRegion, r)) for r in regions) genes_lengths[gene.upper()] = gene_len return pd.DataFrame.from_records( list(genes_lengths.items()), columns=["gene", "gene_weight"])