Source code for dae.genomic_resources.implementations.gene_models_impl

from __future__ import annotations

import json
import logging
import textwrap
from collections import defaultdict
from dataclasses import asdict, dataclass
from functools import lru_cache
from typing import Any

import yaml
from jinja2 import Template

from dae.genomic_resources import GenomicResource
from dae.genomic_resources.gene_models import (
    TranscriptModel,
    build_gene_models_from_resource,
)
from dae.genomic_resources.resource_implementation import (
    GenomicResourceImplementation,
    InfoImplementationMixin,
    ResourceStatistics,
)
from dae.task_graph.graph import Task, TaskGraph

logger = logging.getLogger(__name__)


[docs] @dataclass class StatisticsData: """Class for storing gene models statistics.""" transcript_number: int protein_coding_transcript_number: int gene_number: int protein_coding_gene_number: int
[docs] class GeneModelsStatistics(ResourceStatistics): """Class for accessing reference genome statistics.""" def __init__( self, resource_id: str, chromosome_count: int, global_statistic: StatisticsData, chrom_statistics: dict[str, StatisticsData]): super().__init__(resource_id) self.chromosome_count = chromosome_count self.global_statistic = global_statistic self.chrom_statistics = chrom_statistics
[docs] def serialize(self) -> str: """Serialize gene models statistics.""" result: dict[str, Any] = {} result["resource_id"] = self.resource_id result["chromosome_count"] = self.chromosome_count result["global"] = asdict(self.global_statistic) result["chromosomes"] = { chrom: asdict(stat) for chrom, stat in self.chrom_statistics.items() } return yaml.dump(result, sort_keys=False)
[docs] @staticmethod def deserialize(data: str) -> GeneModelsStatistics: """Deserialize gene models statistics.""" result = yaml.safe_load(data) return GeneModelsStatistics( result["resource_id"], result["chromosome_count"], StatisticsData(**result["global"]), { chrom: StatisticsData(**stat) for chrom, stat in result["chromosomes"].items() }, )
[docs] class GeneModelsImpl( GenomicResourceImplementation, InfoImplementationMixin, ): """Provides class for gene models.""" def __init__(self, resource: GenomicResource): super().__init__(resource) self.gene_models = build_gene_models_from_resource(resource) @property def files(self) -> set[str]: res = set() res.add(self.resource.get_config()["filename"]) gene_mapping_filename = self.resource.get_config().get("gene_mapping") if gene_mapping_filename is not None: res.add(gene_mapping_filename) return res
[docs] def get_template(self) -> Template: return Template(textwrap.dedent(""" {% extends base %} {% block extra_styles %} #chromosomes-table { border-collapse: separate; border-spacing: 0; } #chromosomes-table th { border-top: 1px solid; border-bottom: 1px solid; border-right: 1px solid; } #chromosomes-table td { border-bottom: 1px solid; border-right: 1px solid; } #chromosomes-table th:first-child, #chromosomes-table td:first-child { border-left: 1px solid; } #chromosomes-table thead tr:nth-of-type(2) th { border-top: none; } #chromosomes-table thead { position: sticky; top: 0; background-color: white; } {% endblock %} {% block content %} <h1>Configuration</h1> Gene models file: <p><a href="{{ data.config.filename }}"> {{ data.config.filename }} </a></p> <p>Format: {{ data.config.format }}</p> <h1>Statistics</h1> <h2>Chromosome statistics</h2> <div style="max-height: 50%; overflow-y: auto; width: fit-content"> <table id="chromosomes-table"> <thead> <tr> <th></th> <th>Transcript number</th> <th>Protein coding transcript number</th> <th>Gene number</th> <th>Protein coding gene number</th> </tr> <tr> <th>Global</th> <td>{{ '{:,}'.format(data.stats.global_statistic.transcript_number)}}</td> <td>{{ '{:,}'.format(data.stats.global_statistic.protein_coding_transcript_number)}}</td> <td>{{ '{:,}'.format(data.stats.global_statistic.gene_number)}}</td> <td>{{ '{:,}'.format(data.stats.global_statistic.protein_coding_gene_number)}}</td> </tr> </thead> {% for chrom, stat in data.stats.chrom_statistics.items() %} <tr> <td>{{ chrom }}</td> <td>{{ '{:,}'.format(stat.transcript_number)}}</td> <td>{{ '{:,}'.format(stat.protein_coding_transcript_number)}}</td> <td>{{ '{:,}'.format(stat.gene_number)}}</td> <td>{{ '{:,}'.format(stat.protein_coding_gene_number)}}</td> </tr> {% endfor %} </table> </div> {% endblock %} """))
def _get_template_data(self) -> dict[str, Any]: return {"config": self.config, "stats": self.get_statistics()}
[docs] def get_info(self, **kwargs: Any) -> str: # noqa: ARG002 return InfoImplementationMixin.get_info(self)
[docs] def calc_info_hash(self) -> bytes: return b"placeholder"
[docs] def calc_statistics_hash(self) -> bytes: manifest = self.resource.get_manifest() return json.dumps({ "config": { "format": self.config.get("format"), }, "files_md5": { file_name: manifest[file_name].md5 for file_name in sorted(self.files)}, }, indent=2).encode()
[docs] def add_statistics_build_tasks( self, task_graph: TaskGraph, **kwargs: Any, # noqa: ARG002 ) -> list[Task]: task = task_graph.create_task( f"{self.resource_id}_cals_stats", self._do_statistics, [self.resource], []) return [task]
@staticmethod def _do_statistics(resource: GenomicResource) -> GeneModelsStatistics: gene_models = build_gene_models_from_resource(resource).load() coding_transcripts = [ tm for tm in gene_models.transcript_models.values() if tm.is_coding() ] global_stats = StatisticsData( len(gene_models.transcript_models), len(coding_transcripts), len(gene_models.gene_names()), len({tm.gene for tm in coding_transcripts}), ) tm_by_chrom: dict[str, list[TranscriptModel]] = defaultdict(list) for trm in gene_models.transcript_models.values(): tm_by_chrom[trm.chrom].append(trm) chromosome_stats: dict[str, StatisticsData] = {} for chrom, tms in tm_by_chrom.items(): chrom_coding_transcripts = [ tm for tm in tms if tm.is_coding() ] chromosome_stats[chrom] = StatisticsData( len(tms), len(chrom_coding_transcripts), len({tm.gene for tm in tms}), len({tm.gene for tm in chrom_coding_transcripts}), ) gene_models_stats = GeneModelsStatistics( resource.resource_id, len(tm_by_chrom), global_stats, chromosome_stats, ) with resource.proto.open_raw_file( resource, "statistics/stats.yaml", "wt") as stats_file: stats_file.write(gene_models_stats.serialize()) return gene_models_stats
[docs] @lru_cache(maxsize=64) # noqa: B019 def get_statistics(self) -> GeneModelsStatistics | None: # type: ignore try: with self.resource.proto.open_raw_file( self.resource, "statistics/stats.yaml", "rt") as stats_file: return GeneModelsStatistics.deserialize( stats_file.read()) except FileExistsError: return None