Source code for dae.enrichment_tool.enrichment_cache_builder

import argparse
import json
import logging
from dataclasses import asdict
from pathlib import Path

from dae.effect_annotation.effect import expand_effect_types
from dae.enrichment_tool.enrichment_utils import (
    EnrichmentEventCounts,
    get_enrichment_cache_path,
    get_enrichment_config,
)
from dae.enrichment_tool.event_counters import (
    EVENT_COUNTERS,
    EventCountersResult,
)
from dae.enrichment_tool.genotype_helper import GenotypeHelper
from dae.gpf_instance import GPFInstance
from dae.studies.study import GenotypeData
from dae.utils.verbosity_configuration import VerbosityConfiguration

logger = logging.getLogger("enrichment_cache_builder")



[docs]
def cli(
    argv: list[str] | None = None,
    gpf_instance: GPFInstance | None = None,
) -> None:
    """Generate enrichment tool cache.

    Parses the command line and then either prints the IDs of the genotype
    studies that have an enrichment configuration (``--show-studies``) or
    builds the enrichment event counts cache for the requested studies.
    Studies without an enrichment configuration are skipped.

    Args:
        argv: Command line arguments. ``None`` lets ``argparse`` fall back
            to ``sys.argv[1:]``.
        gpf_instance: GPF instance to work with. When ``None`` an instance
            is created with ``GPFInstance.build()``.
    """
    description = "Generate enrichment tool cache"
    parser = argparse.ArgumentParser(description=description)
    VerbosityConfiguration.set_arguments(parser)

    parser.add_argument(
        "--show-studies",
        help="This option will print available "
        "genotype studies and groups names",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--studies",
        help="Specify genotype studies and groups "
        "names for generating enrichment cache. Default to all.",
        default=None,
        action="store",
    )

    args = parser.parse_args(argv)

    VerbosityConfiguration.set(args)
    # Pin the "impala" logger to WARNING independently of the verbosity
    # selected on the command line.
    logging.getLogger("impala").setLevel(logging.WARNING)

    if gpf_instance is None:
        gpf_instance = GPFInstance.build()

    if args.show_studies:
        for study in gpf_instance.get_all_genotype_data():
            if get_enrichment_config(study) is not None:
                print(study.study_id)
        return

    if args.studies:
        # Tolerate blanks around the separators ("a, b") and stray or
        # trailing commas ("a,b,") instead of looking up " b" or "".
        study_ids = [
            study_id.strip()
            for study_id in args.studies.split(",")
            if study_id.strip()
        ]
    else:
        study_ids = gpf_instance.get_genotype_data_ids()

    # Keep each study together with its enrichment configuration so the
    # configuration is looked up only once per study.
    selected = []
    for study_id in study_ids:
        study = gpf_instance.get_genotype_data(study_id)
        enrichment_config = get_enrichment_config(study)
        if enrichment_config is not None:
            selected.append((study, enrichment_config))

    logger.warning(
        "generating enrichment cache for studies: %s",
        [study.study_id for study, _ in selected])

    for study, enrichment_config in selected:
        logger.info(
            "building enrichment cache for study %s", study.study_id)
        # Only the first selected person set collection is used for the cache.
        psc_id = enrichment_config["selected_person_set_collections"][0]
        build_enrichment_event_counts_cache(study, psc_id)




[docs]
def build_enrichment_event_counts_cache(
    study: GenotypeData,
    psc_id: str,
) -> None:
    """Build enrichment event counts cache for a genotype data.

    For every counter in ``EVENT_COUNTERS``, every person set of the
    requested person set collection and every effect group listed under
    ``effect_types`` in the study's enrichment configuration, the de novo
    events are counted. The nested result
    ``{counter_id: {person_set_id: {effect_group: counts}}}`` is written as
    JSON to the path returned by ``get_enrichment_cache_path(study)``.

    Nothing is written when the study has no enrichment configuration.

    Args:
        study: Genotype data to build the cache for.
        psc_id: ID of the person set collection whose person sets are used.
    """
    psc = study.get_person_set_collection(psc_id)
    assert psc is not None

    enrichment_config = get_enrichment_config(study)
    if enrichment_config is None:
        return

    effect_groups = enrichment_config["effect_types"]
    genotype_helper = GenotypeHelper(
        study, psc, effect_types=expand_effect_types(effect_groups))

    # None of the values below depends on the counter, so they are computed
    # once up front instead of inside the innermost loop.
    # NOTE(review): assumes these helpers are side-effect free and that the
    # counters do not mutate the arguments they receive - confirm.
    denovo_events = genotype_helper.get_denovo_events()
    children_by_person_set = [
        (ps_id, person_set.get_children_by_sex())
        for ps_id, person_set in psc.person_sets.items()
    ]
    expanded_effect_groups = [
        (effect_group, expand_effect_types(effect_group))
        for effect_group in effect_groups
    ]

    result: EnrichmentEventCounts = {}
    for counter_id, counter in EVENT_COUNTERS.items():
        result[counter_id] = {}
        for ps_id, children_by_sex in children_by_person_set:
            result[counter_id][ps_id] = {}
            for effect_group, effect_types in expanded_effect_groups:
                events = counter.events(
                    denovo_events, children_by_sex, effect_types)
                counts = EventCountersResult.from_events_result(events)
                result[counter_id][ps_id][effect_group] = asdict(counts)

    cache_path = get_enrichment_cache_path(study)
    # json.dumps emits ASCII-only text by default; the explicit encoding just
    # makes the write independent of the locale.
    Path(cache_path).write_text(
        json.dumps(result, indent=4), encoding="utf-8")