Source code for dae.annotation.gene_set_annotator

import logging
from typing import Any

from dae.annotation.annotatable import Annotatable
from dae.annotation.annotation_config import (
    AnnotationConfigParser,
    AnnotatorInfo,
)
from dae.annotation.annotation_pipeline import (
    AnnotationPipeline,
    Annotator,
)
from dae.annotation.annotator_base import AnnotatorBase
from dae.gene_sets.gene_sets_db import (
    GeneSet,
    build_gene_set_collection_from_resource,
)
from dae.genomic_resources import GenomicResource

logger = logging.getLogger(__name__)


def build_gene_set_annotator(
    pipeline: AnnotationPipeline,
    info: AnnotatorInfo,
) -> Annotator:
    """Create a gene set annotator.

    Reads two parameters from ``info.parameters``:

    * ``resource_id`` - ID of the gene set collection resource, looked up
      in ``pipeline.repository``;
    * ``input_gene_list`` - name of a pipeline attribute that provides the
      list of genes to test against the gene sets.

    Raises:
        ValueError: if ``resource_id`` is missing or empty, if the resource
            is not available in the repository, if ``input_gene_list`` is
            missing, or if the named attribute is not provided by the
            pipeline or is not of type ``object``.
    """
    # Use .get() so that a missing key reaches the ValueError below instead
    # of escaping as a bare KeyError.
    gene_set_resource_id = info.parameters.get("resource_id")
    if not gene_set_resource_id:
        raise ValueError(f"The {info} needs a 'resource_id' parameter.")

    gene_set_resource = pipeline.repository.get_resource(
        gene_set_resource_id)
    if gene_set_resource is None:
        raise ValueError(f"The {gene_set_resource_id} is not available.")

    input_gene_list = info.parameters.get("input_gene_list")
    if input_gene_list is None:
        # The message must describe the annotator configuration (`info`);
        # it previously interpolated the `input` builtin by mistake.
        raise ValueError(f"The {info} must have an 'input_gene_list' "
                         "parameter")

    input_gene_list_info = pipeline.get_attribute_info(input_gene_list)
    if input_gene_list_info is None:
        raise ValueError(f"The {input_gene_list} is not privided by the "
                         "pipeline.")
    if input_gene_list_info.type != "object":
        raise ValueError(f"The {input_gene_list} privided by the pipeline "
                         "is not of type object.")

    return GeneSetAnnotator(
        pipeline, info, gene_set_resource, input_gene_list,
    )
class GeneSetAnnotator(AnnotatorBase):
    """Gene set annotator class.

    For a list of genes read from the annotation context, reports which
    gene sets of a gene set collection contain at least one of those genes:
    one boolean per gene set, plus an ``in_sets`` list with the names of
    the matching gene sets.
    """

    def __init__(
        self,
        pipeline: AnnotationPipeline | None,
        info: AnnotatorInfo,
        gene_set_resource: GenomicResource,
        input_gene_list: str,
    ):
        """Build the annotator from a gene set collection resource.

        Args:
            pipeline: the pipeline this annotator belongs to, if any.
            info: annotator configuration; its ``resources`` and
                ``documentation`` are updated here, and its ``attributes``
                are filled in with defaults when none are configured.
            gene_set_resource: resource holding the gene set collection.
            input_gene_list: name of the context attribute that holds the
                genes to test.
        """
        self.gene_set_resource = gene_set_resource
        self.gene_set_collection = build_gene_set_collection_from_resource(
            self.gene_set_resource)
        # Filled in by open(); None means the annotator is not open yet.
        self.gene_sets: list[GeneSet] | None = None
        self.input_gene_list = input_gene_list

        info.resources += [gene_set_resource]
        info.documentation = (
            "This gene set collection annotator uses the "
            f"**{self.gene_set_collection.collection_id}** "
            f"gene set collection."
        )
        source_type_desc = self._build_source_type_desc(info)
        super().__init__(pipeline, info, source_type_desc)

    def _build_source_type_desc(
        self, info: AnnotatorInfo,
    ) -> dict[str, tuple[str, str]]:
        """Map every available source to its ``(type, description)`` pair.

        When ``info.attributes`` is non-empty, only the requested sources
        are described. Otherwise the first 20 gene sets of the listing plus
        ``in_sets`` are used and ``info.attributes`` is set accordingly.

        Raises:
            ValueError: if a configured attribute source is neither
                ``in_sets`` nor a gene set of the collection.
        """
        gene_sets_list = self.gene_set_collection \
            .get_gene_sets_list_statistics()
        if gene_sets_list is None:
            # No statistics available: load the collection and build the
            # same listing, largest gene sets first, ties broken by name.
            logger.info(
                "The gene set collection statistics for %s is empty.",
                self.gene_set_collection.collection_id,
            )
            self.gene_set_collection.load()
            gene_sets_list = [
                {"name": gs.name, "count": gs.count,
                 "desc": gs.desc or gs.name}
                for gs in sorted(
                    self.gene_set_collection.get_all_gene_sets(),
                    key=lambda gs: (-gs.count, gs.name),
                )
            ]

        in_sets_desc = (
            "object", (
                "List of the gene sets of the collection, "
                "which have at least one gene from the input gene "
                "list"
            ))

        if info.attributes:
            gene_sets_desc = {gs["name"]: gs["desc"] for gs in gene_sets_list}
            source_type_desc = {}
            for attr in info.attributes:
                if attr.source == "in_sets":
                    source_type_desc["in_sets"] = in_sets_desc
                    continue
                if attr.source not in gene_sets_desc:
                    raise ValueError(
                        f"The attribute {attr.source} is not found in the "
                        f"gene set collection "
                        f"{self.gene_set_collection.collection_id}.")
                source_type_desc[attr.source] = (
                    "bool", gene_sets_desc[attr.source])
        else:
            # Default attributes: the first 20 listed gene sets + in_sets.
            source_type_desc = {
                gs["name"]: ("bool", gs["desc"])
                for gs in gene_sets_list[:20]
            }
            source_type_desc["in_sets"] = in_sets_desc

            info.attributes = AnnotationConfigParser.parse_raw_attributes([
                *source_type_desc.keys(),
            ])

        return source_type_desc

    @property
    def used_context_attributes(self) -> tuple[str, ...]:
        """Return the context attributes this annotator reads."""
        return (self.input_gene_list,)

    def open(self) -> Annotator:
        """Load the gene set collection and mark the annotator as open."""
        self.gene_set_collection.load()
        self.gene_sets = self.gene_set_collection.get_all_gene_sets()
        super().open()
        return self

    def _do_annotate(
        self,
        annotatable: Annotatable | None,  # noqa: ARG002
        context: dict[str, Any],
    ) -> dict[str, Any]:
        """Flag every gene set that shares a gene with the input list.

        Returns ``self._empty_result()`` when the context has no value for
        the input gene list. Otherwise returns a dict with a boolean per
        gene set name and ``in_sets`` listing the names of the gene sets
        that matched.

        Raises:
            ValueError: if the annotator has not been opened.
        """
        genes = context.get(self.input_gene_list)
        if genes is None:
            return self._empty_result()

        genes_set = set(genes)
        if self.gene_sets is None:
            raise ValueError(
                f"The GeneSetAnnotator {self.gene_set_resource} "
                f"is not open.")

        in_sets: list[str] = []
        output: dict[str, Any] = {"in_sets": in_sets}
        for gs in self.gene_sets:
            # isdisjoint() walks gs.syms directly and stops at the first
            # shared gene, so no temporary set is built per gene set.
            has_gene = not genes_set.isdisjoint(gs.syms)
            output[gs.name] = has_gene
            if has_gene:
                in_sets.append(gs.name)
        return output