Source code for gain.gene_sets.implementations.gene_sets_impl

import copy
import json
from collections import Counter
from typing import Any, ClassVar

from markdown2 import markdown

from gain.gene_sets.gene_set import build_gene_set_collection_from_resource
from gain.genomic_resources.histogram import (
    CategoricalHistogram,
    CategoricalHistogramConfig,
    NumberHistogram,
    NumberHistogramConfig,
    plot_histogram,
)
from gain.genomic_resources.repository import (
    GenomicResource,
)
from gain.genomic_resources.resource_implementation import (
    GenomicResourceImplementation,
    InfoImplementationMixin,
    ResourceConfigValidationMixin,
)
from gain.task_graph.graph import TaskDesc, TaskGraph


class GeneSetCollectionImpl(
    GenomicResourceImplementation,
    InfoImplementationMixin,
    ResourceConfigValidationMixin,
):
    """Gene sets collection resource implementations.

    Wraps a gene set collection built from a genomic resource and provides
    the statistics build step for it: two histograms (genes per gene set,
    gene sets per gene) plus two JSON summaries written under the
    resource's ``statistics/`` directory.
    """

    # Names of the templates used by the info-page machinery.
    template_name: ClassVar[str] = "gene_set_collection.jinja"
    styles_template_name: ClassVar[str] = "gene_set_collection_styles.jinja"

    def __init__(self, resource: GenomicResource) -> None:
        """Build the gene set collection backing ``resource``.

        Raises:
            ValueError: if the resource has no configuration.
        """
        super().__init__(resource)
        config = resource.get_config()
        if config is None:
            raise ValueError(
                f"genomic resource {resource.resource_id} not configured")
        self.gene_set_collection = build_gene_set_collection_from_resource(
            resource,
        )

    def _compute_and_save_gene_statistics(self) -> dict:
        """Count gene sets and unique gene symbols; save and return them.

        The result is written as JSON to
        ``statistics/gene_collection_count_statistics.json``.
        """
        all_gene_sets = self.gene_set_collection.get_all_gene_sets()
        unique_genes = {
            sym for gene_set in all_gene_sets for sym in gene_set.syms
        }
        result = {
            "number_of_gene_sets": len(all_gene_sets),
            "number_of_unique_genes": len(unique_genes),
        }
        with self.resource.proto.open_raw_file(
            self.resource,
            "statistics/gene_collection_count_statistics.json",
            "wt",
        ) as statistics_file:
            json.dump(result, statistics_file)
        return result

    def _compute_and_save_gene_sets_list_statistics(self) -> list[dict]:
        """Save and return a per-gene-set summary list.

        Each entry holds the gene set ``name``, its ``count`` and a ``desc``
        (falling back to the name when the description is empty). Entries
        are ordered by descending count, ties broken by name. The list is
        written as JSON to ``statistics/gene_sets_list_statistics.json``.
        """
        all_gene_sets = self.gene_set_collection.get_all_gene_sets()
        result = [
            {"name": gs.name, "count": gs.count, "desc": gs.desc or gs.name}
            for gs in sorted(
                all_gene_sets, key=lambda gs: (-gs.count, gs.name))
        ]
        with self.resource.proto.open_raw_file(
            self.resource,
            "statistics/gene_sets_list_statistics.json",
            "wt",
        ) as statistics_file:
            json.dump(result, statistics_file)
        return result

    def _get_template_data(self) -> dict:
        """Collect the values handed to the info template.

        Starts from a deep copy of the resource config, renders ``meta``
        from markdown when present, exposes this implementation as ``impl``
        and adds the collection counts, or ``"?"`` placeholders when the
        count statistics are not available.
        """
        info = copy.deepcopy(self.config)
        if "meta" in info:
            info["meta"] = markdown(str(info["meta"]))
        info["impl"] = self

        statistics = self.gene_set_collection \
            .get_gene_collection_count_statistics()
        for key in ("number_of_gene_sets", "number_of_unique_genes"):
            info[key] = statistics[key] if statistics is not None else "?"
        return info

    def create_statistics_build_tasks(
        self,
        **kwargs: Any,  # noqa: ARG002
    ) -> list[TaskDesc]:
        """Return the single task that computes and saves all statistics."""
        return [
            TaskGraph.make_task(
                f"{self.resource.resource_id}_compute_and_save_all_statistics",
                self._compute_and_save_all_statistics,
                args=[],
                deps=[],
            ),
        ]

    def _compute_and_save_all_statistics(self) -> None:
        """Load the collection, then build and save every statistic."""
        self.gene_set_collection.load()

        hist = self._calc_genes_per_gene_set_hist()
        self._save_genes_per_gene_set_hist(hist)

        hist = self._calc_gene_sets_per_gene_hist()
        self._save_gene_sets_per_gene_hist(hist)

        self._compute_and_save_gene_statistics()
        self._compute_and_save_gene_sets_list_statistics()

    def get_info(self, **kwargs: Any) -> str:  # noqa: ARG002
        """Delegate to ``InfoImplementationMixin.get_info``."""
        return InfoImplementationMixin.get_info(self)

    def get_statistics_info(self, **kwargs: Any) -> str:  # noqa: ARG002
        """Delegate to ``InfoImplementationMixin.get_statistics_info``."""
        return InfoImplementationMixin.get_statistics_info(self)

    def calc_info_hash(self) -> bytes:
        """Return a constant placeholder value as the info hash."""
        return b"placeholder"

    def calc_statistics_hash(self) -> bytes:
        """Return the bytes that identify the inputs of the statistics.

        The value is a JSON document (sorted keys) holding the manifest md5
        of every file of the collection and, when configured, the explicitly
        set fields of the two histogram configurations.
        """
        manifest = self.resource.get_manifest()
        result: dict[str, Any] = {
            "files_md5": {
                fn: manifest[fn].md5
                for fn in self.gene_set_collection.files
            },
        }
        hist_configs = self.gene_set_collection.config.histograms
        if hist_configs is not None:
            for name in ("genes_per_gene_set", "gene_sets_per_gene"):
                hist_config = hist_configs.get(name)
                if hist_config is not None:
                    result[name] = hist_config.model_dump(
                        exclude_unset=True)
        return json.dumps(
            result, indent=2, sort_keys=True).encode()

    def _load_hist_config(self, name: str) -> dict[str, Any]:
        """Return the explicitly set fields of histogram ``name`` or {}."""
        config = self.gene_set_collection.config
        schema = (
            config.histograms.get(name)
            if config and config.histograms else None
        )
        return schema.model_dump(exclude_unset=True) if schema else {}

    @staticmethod
    def _build_histogram(
        hist_config: dict[str, Any],
        default_view_range: list[int],
        *,
        whole_values_y: bool = False,
    ) -> NumberHistogram | CategoricalHistogram:
        """Create an empty histogram of the configured type.

        A config whose ``type`` is ``"number"`` yields a number histogram;
        its ``view_range`` is taken from the config (``min``/``max``) or,
        when absent, from ``default_view_range``. Anything else yields a
        categorical histogram. ``hist_config`` is modified in place.
        """
        hist_type = hist_config.pop("type", None)
        if hist_type == "number":
            configured = hist_config.get("view_range")
            if configured is None:
                hist_config["view_range"] = default_view_range
            else:
                hist_config["view_range"] = [
                    configured.get("min"),
                    configured.get("max"),
                ]
            return NumberHistogram(NumberHistogramConfig(**hist_config))

        if whole_values_y:
            hist_config["allow_only_whole_values_y"] = True
        return CategoricalHistogram(
            CategoricalHistogramConfig(**hist_config))

    def _calc_genes_per_gene_set_hist(
        self,
    ) -> NumberHistogram | CategoricalHistogram:
        """Build the histogram of gene counts over all gene sets."""
        hist_config = self._load_hist_config("genes_per_gene_set")
        counts = [
            gs.count for gs in self.gene_set_collection.get_all_gene_sets()
        ]
        # default=0 keeps an empty collection from raising in min()/max(),
        # matching how the gene-sets-per-gene histogram treats that case.
        histogram = self._build_histogram(
            hist_config,
            [min(counts, default=0), max(counts, default=0)],
            whole_values_y=True,
        )
        for count in counts:
            histogram.add_value(count)
        return histogram

    def _calc_gene_sets_per_gene_hist(
        self,
    ) -> NumberHistogram | CategoricalHistogram:
        """Build the histogram of how many gene sets contain each gene."""
        hist_config = self._load_hist_config("gene_sets_per_gene")
        # set() makes a gene listed twice in one gene set count only once.
        gene_counter = Counter(
            gene
            for gs in self.gene_set_collection.get_all_gene_sets()
            for gene in set(gs.syms)
        )
        histogram = self._build_histogram(
            hist_config,
            [0, max(gene_counter.values(), default=0)],
        )
        for count in gene_counter.values():
            histogram.add_value(count)
        return histogram

    def _save_histogram(
        self,
        histogram: CategoricalHistogram | NumberHistogram,
        json_filename: str,
        image_filename: str,
        x_label: str,
        y_label: str,
    ) -> CategoricalHistogram | NumberHistogram:
        """Write the serialized histogram and its plot into the resource."""
        with self.resource.proto.open_raw_file(
            self.resource,
            json_filename,
            mode="wt",
        ) as outfile:
            outfile.write(histogram.serialize())

        plot_histogram(
            self.resource,
            image_filename,
            histogram,
            x_label,
            y_label,
        )
        return histogram

    def _save_genes_per_gene_set_hist(
        self,
        histogram: CategoricalHistogram | NumberHistogram,
    ) -> CategoricalHistogram | NumberHistogram:
        """Save the genes-per-gene-set histogram (JSON and image)."""
        return self._save_histogram(
            histogram,
            "statistics/genes_per_gene_set_histogram.json",
            self.gene_set_collection
            .get_genes_per_gene_set_hist_image_filename(),
            "gene count per gene set",
            "count of gene sets",
        )

    def _save_gene_sets_per_gene_hist(
        self,
        histogram: CategoricalHistogram | NumberHistogram,
    ) -> CategoricalHistogram | NumberHistogram:
        """Save the gene-sets-per-gene histogram (JSON and image)."""
        return self._save_histogram(
            histogram,
            "statistics/gene_sets_per_gene_histogram.json",
            self.gene_set_collection
            .get_gene_sets_per_gene_hist_image_filename(),
            "number of gene sets the gene is present in",
            "number of genes",
        )

    @staticmethod
    def get_schema() -> dict[str, Any]:
        """Not implemented for this resource type.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError