Source code for dae.tools.dataset_helpers

import glob
import logging
import os
import shutil
from typing import Any, cast

import toml
from box import Box

from dae.genotype_storage.genotype_storage import GenotypeStorage
from dae.gpf_instance.gpf_instance import GPFInstance

logger = logging.getLogger(__name__)


[docs] class DatasetHelpers: """Helper class for work with studies in impala genotype storage.""" def __init__(self, gpf_instance: GPFInstance | None = None) -> None: if gpf_instance is None: self.gpf_instance = GPFInstance.build() else: self.gpf_instance = gpf_instance
[docs] def find_genotype_data_config_file( self, dataset_id: str, ) -> str | None: """Find and return config filename for a dataset.""" config = self.gpf_instance.get_genotype_data_config(dataset_id) if config is None: self.gpf_instance.reload() config = self.gpf_instance.get_genotype_data_config(dataset_id) if config is None: return None assert config is not None, dataset_id conf_dir = config.conf_dir result = glob.glob(os.path.join(conf_dir, "*.conf")) assert len(result) == 1, \ f"unexpected number of config files in {conf_dir}" config_file = result[0] assert os.path.exists(config_file) return config_file
[docs] def find_genotype_data_config(self, dataset_id: str) -> Box | None: """Find and return configuration of a dataset.""" config_file = self.find_genotype_data_config_file(dataset_id) if config_file is None: return None with open(config_file, "r") as infile: short_config = toml.load(infile) return Box(short_config)
[docs] def get_genotype_storage(self, dataset_id: str) -> GenotypeStorage | None: """Find the genotype storage that stores a dataset.""" config = self.find_genotype_data_config(dataset_id) if config is None: return None gpf_instance = self.gpf_instance return cast( GenotypeStorage | None, gpf_instance .genotype_storages .get_genotype_storage( config.genotype_storage.id))
[docs] def rename_study_config( self, dataset_id: str, new_id: str, config_content: dict[str, Any], *, dry_run: bool | None = None, ) -> None: """Rename study config for a dataset.""" config_file = self.find_genotype_data_config_file(dataset_id) if config_file is None: return logger.info("going to disable config file %s", config_file) if not dry_run: os.rename(config_file, f"{config_file}_bak") config_dirname = os.path.dirname(config_file) new_dirname = os.path.join(os.path.dirname(config_dirname), new_id) logger.info( "going to rename config directory %s to %s", config_dirname, new_dirname) if not dry_run: os.rename(config_dirname, new_dirname) new_config_file = os.path.join(new_dirname, f"{new_id}.conf") logger.info("going to create a new config file %s", new_config_file) if not dry_run: with open(new_config_file, "wt") as outfile: content = toml.dumps(config_content) outfile.write(content)
[docs] def remove_study_config(self, dataset_id: str) -> None: """Remove study config for a dataset.""" config_file = self.find_genotype_data_config_file(dataset_id) if config_file is None: logger.warning("config file for dataset %s not found", dataset_id) return config_dir = os.path.dirname(config_file) shutil.rmtree(config_dir)
[docs] def disable_study_config( self, dataset_id: str, *, dry_run: bool | None = None, ) -> None: """Disable dataset.""" config_file = self.find_genotype_data_config_file(dataset_id) if config_file is None: logger.warning("config file for dataset %s not found", dataset_id) return config_dir = os.path.dirname(config_file) logger.info("going to disable study_config %s", config_file) if not dry_run: os.rename(config_file, f"{config_file}_bak") os.rename(config_dir, f"{config_dir}_bak")