# Source code for utils.testing

# pylint: disable=C0116
import pathlib
import textwrap

from gpf_instance.gpf_instance import WGPFInstance

from dae.gene_sets.denovo_gene_set_helpers import DenovoGeneSetHelpers
from dae.genomic_resources import build_genomic_resource_repository
from dae.genomic_resources.cli import cli_manage
from dae.genomic_resources.repository import (
    GR_CONF_FILE_NAME,
    GenomicResourceRepo,
)
from dae.gpf_instance.gpf_instance import GPFInstance
from dae.pheno.pheno_import import main as pheno_import
from dae.studies.study import GenotypeData
from dae.testing import (
    setup_directories,
    setup_gpf_instance,
    setup_pedigree,
    setup_vcf,
    vcf_study,
)
from dae.testing.import_helpers import setup_dataset_config
from dae.testing.t4c8_import import t4c8_genes, t4c8_genome


def setup_t4c8_grr(
    root_path: pathlib.Path,
) -> GenomicResourceRepo:
    """Create the t4c8 genomic resource repository under ``root_path``.

    The repository is written to ``root_path / "t4c8_grr"``. The genome and
    gene models come from ``t4c8_genome`` and ``t4c8_genes``; this function
    adds a gene score, a gene set collection, a position score and a coding
    length background resource, runs ``repo-repair`` through ``cli_manage``
    and returns a directory repository pointing at the result.

    NOTE(review): line breaks and YAML nesting inside the multi-line
    literals were restored from a whitespace-collapsed listing -- confirm
    against the upstream file.
    """
    repo_path = root_path / "t4c8_grr"
    t4c8_genome(repo_path)
    t4c8_genes(repo_path)

    setup_directories(
        repo_path / "gene_scores" / "t4c8_score",
        {
            GR_CONF_FILE_NAME: """
                type: gene_score
                filename: t4c8_gene_score.csv
                scores:
                - id: t4c8_score
                  histogram:
                    type: number
                    number_of_bins: 3
                    x_log_scale: false
                    y_log_scale: false
                meta:
                  description: t4c8 gene score
            """,
            "t4c8_gene_score.csv": textwrap.dedent("""
                gene,t4c8_score
                t4,10.123456789
                c8,20.0
            """),
        },
    )

    setup_directories(
        repo_path / "gene_sets" / "main",
        {
            GR_CONF_FILE_NAME: """
                type: gene_set
                id: main
                format: directory
                directory: main_gene_sets
                web_label: Main
                web_format_str: "key| (|count|): |desc"
                meta:
                  description: t4c8 main gene sets
            """,
            # Each gene set file lists: set id, description, then the genes.
            "main_gene_sets": {
                "t4_candidates.txt": textwrap.dedent(
                    """t4_candidates
                    T4 Candidates
                    t4
                    """),
                "c8_candidates.txt": textwrap.dedent(
                    """c8_candidates
                    C8 Candidates
                    c8
                    """),
                "all_candidates.txt": textwrap.dedent(
                    """all_candidates
                    All Candidates
                    t4
                    c8
                    """),
            },
        },
    )

    setup_directories(
        repo_path / "genomic_scores" / "score_one",
        {
            GR_CONF_FILE_NAME: textwrap.dedent("""
                type: position_score
                table:
                  filename: data.txt
                scores:
                - id: score_one
                  type: float
                  name: score
            """),
            # "\t" is a real tab here: the table is tab separated.
            "data.txt": textwrap.dedent("""
                chrom\tpos_begin\tscore
                chr1\t4\t0.01
                chr1\t54\t0.02
                chr1\t90\t0.03
                chr1\t100\t0.04
                chr1\t119\t0.05
                chr1\t122\t0.06
            """),
        },
    )

    setup_directories(
        repo_path / "coding_len_background",
        {
            "genomic_resource.yaml": textwrap.dedent("""
                type: gene_score
                filename: coding_len_background.tsv
                separator: "\t"
                scores:
                - id: gene_weight
                  name: t4c8CodingLenBackground
                  histogram:
                    type: number
                    number_of_bins: 10
                    view_range:
                      min: 0
                      max: 20
                meta:
                  description: T4C8 gene coding length enrichment background model
            """),
            "coding_len_background.tsv":
                "gene\tgene_weight\nT4\t44\nC8\t45",
        },
    )

    # Regenerate repository metadata for the resources written above.
    cli_manage(["repo-repair", "-R", str(repo_path), "-j", "1"])

    return build_genomic_resource_repository({
        "id": "t4c8_local",
        "type": "directory",
        "directory": str(repo_path),
    })
def setup_t4c8_instance(
    root_path: pathlib.Path,
) -> GPFInstance:
    """Build a GPF instance with the t4c8 studies, dataset and pheno data.

    Creates the GRR via ``setup_t4c8_grr``, writes the instance and
    annotation configuration under ``root_path / "gpf_instance"``, imports
    the ``study_1_pheno`` phenotype data, imports studies 1, 2 and 4 plus
    the ``t4c8_dataset`` dataset, and builds the denovo gene set collection
    for each of them. The instance is reloaded before it is returned.

    NOTE(review): line breaks and YAML nesting inside the multi-line
    literals were restored from a whitespace-collapsed listing -- confirm
    against the upstream file.
    """
    t4c8_grr = setup_t4c8_grr(root_path)

    instance_path = root_path / "gpf_instance"

    _t4c8_default_study_config(instance_path)

    setup_directories(
        instance_path, {
            "gpf_instance.yaml": textwrap.dedent("""
                instance_id: t4c8_instance
                annotation:
                  conf_file: annotation.yaml
                reference_genome:
                  resource_id: t4c8_genome
                gene_models:
                  resource_id: t4c8_genes
                gene_scores_db:
                  gene_scores:
                  - "gene_scores/t4c8_score"
                gene_sets_db:
                  gene_set_collections:
                  - gene_sets/main
                default_study_config:
                  conf_file: default_study_configuration.yaml
                genotype_storage:
                  default: duckdb_wgpf_test
                  storages:
                  - id: duckdb_wgpf_test
                    storage_type: duckdb_parquet
                    memory_limit: 16GB
                    base_dir: '%(wd)s/duckdb_storage'
                gpfjs:
                  visible_datasets:
                  - t4c8_dataset
                  - t4c8_study_1
                  - nonexistend_dataset
            """),
            "annotation.yaml": textwrap.dedent("""
               - position_score: genomic_scores/score_one
            """),
        },
    )

    # The phenotype data is imported before the instance is created.
    _study_1_pheno(
        root_path,
        instance_path,
    )

    gpf_instance = setup_gpf_instance(
        instance_path,
        grr=t4c8_grr,
    )

    _t4c8_study_1(root_path, gpf_instance)
    _t4c8_study_2(root_path, gpf_instance)
    _t4c8_dataset(gpf_instance)
    _t4c8_study_4(root_path, gpf_instance)

    gpf_instance.reload()

    for study_id in [
            "t4c8_study_1", "t4c8_study_2", "t4c8_dataset", "t4c8_study_4"]:
        study = gpf_instance.get_genotype_data(study_id)
        assert study is not None, study_id
        DenovoGeneSetHelpers.build_collection(study)

    gpf_instance.reload()

    return gpf_instance
def _t4c8_study_1(
    root_path: pathlib.Path,
    t4c8_instance: GPFInstance,
) -> GenotypeData:
    """Import ``t4c8_study_1``: two quad families with six chr1 variants.

    Writes the pedigree and VCF under ``root_path / "t4c8_study_1"`` and
    imports them with ``vcf_study``; the study config is linked to the
    ``study_1_pheno`` phenotype data.

    NOTE(review): row breaks in the pedigree/VCF literals were restored
    from a whitespace-collapsed listing -- confirm against upstream.
    """
    ped_path = setup_pedigree(
        root_path / "t4c8_study_1" / "pedigree" / "in.ped",
        """
familyId personId dadId momId sex status role phenotype
f1.1     mom1     0     0     2   1      mom  unaffected
f1.1     dad1     0     0     1   1      dad  unaffected
f1.1     p1       dad1  mom1  2   2      prb  autism
f1.1     s1       dad1  mom1  1   1      sib  unaffected
f1.3     mom3     0     0     2   1      mom  unaffected
f1.3     dad3     0     0     1   1      dad  unaffected
f1.3     p3       dad3  mom3  2   2      prb  autism
f1.3     s3       dad3  mom3  2   1      sib  unaffected
        """)
    vcf_path1 = setup_vcf(
        root_path / "t4c8_study_1" / "vcf" / "in.vcf.gz",
        """
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
##contig=<ID=chr2>
##contig=<ID=chr3>
#CHROM POS ID REF ALT  QUAL FILTER INFO FORMAT mom1 dad1 p1  s1  mom3 dad3 p3  s3
chr1   4   .  T   G,TA .    .      .    GT     0/1  0/1  0/0 0/0 0/1  0/2  0/2 0/0
chr1   54  .  T   C    .    .      .    GT     0/1  0/1  0/1 0/1 0/0  0/0  0/1 0/1
chr1   90  .  G   C,GA .    .      .    GT     0/1  0/2  0/2 0/2 0/1  0/2  0/1 0/2
chr1   100 .  T   G,TA .    .      .    GT     0/1  0/1  0/0 0/0 0/2  0/0  0/0 0/0
chr1   119 .  A   G,C  .    .      .    GT     0/0  0/0  0/2 0/2 0/1  0/2  0/1 0/2
chr1   122 .  A   C,AC .    .      .    GT     0/1  0/1  0/1 0/1 0/2  0/2  0/2 0/1
        """)  # noqa: E501

    return vcf_study(
        root_path,
        "t4c8_study_1", ped_path, [vcf_path1],
        t4c8_instance,
        project_config_update={
            "input": {
                "vcf": {
                    "denovo_mode": "denovo",
                    "omission_mode": "omission",
                },
            },
        },
        study_config_update={
            "phenotype_data": "study_1_pheno",
        },
    )


def _t4c8_study_2(
    root_path: pathlib.Path,
    t4c8_instance: GPFInstance,
) -> GenotypeData:
    """Import ``t4c8_study_2``: two families with three chr1 variants.

    The pedigree has no ``phenotype`` column; the study config overrides
    the ``phenotype`` person set collection so that it is derived from the
    pedigree ``status`` column.

    NOTE(review): row breaks in the pedigree/VCF literals were restored
    from a whitespace-collapsed listing -- confirm against upstream.
    """
    ped_path = setup_pedigree(
        root_path / "t4c8_study_2" / "pedigree" / "in.ped",
        """
familyId personId dadId momId sex status role
f2.1     mom1     0     0     2   1      mom
f2.1     dad1     0     0     1   1      dad
f2.1     ch1      dad1  mom1  2   2      prb
f2.3     mom3     0     0     2   1      mom
f2.3     dad3     0     0     1   1      dad
f2.3     ch3      dad3  mom3  2   2      prb
f2.3     ch4      dad3  mom3  2   0      prb
        """)
    vcf_path1 = setup_vcf(
        root_path / "t4c8_study_2" / "vcf" / "in.vcf.gz",
        """
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
##contig=<ID=chr2>
##contig=<ID=chr3>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 ch1 mom3 dad3 ch3 ch4
chr1   5   .  A   C   .    .      .    GT     0/0  0/0  0/1 0/0  0/0  0/0 0/1
chr1   6   .  C   G   .    .      .    GT     0/0  0/0  0/0 0/0  0/0  0/1 0/0
chr1   7   .  G   T   .    .      .    GT     0/0  1/0  0/1 0/0  0/0  0/0 0/1
        """)

    project_config_update = {
        "input": {
            "vcf": {
                "denovo_mode": "denovo",
                "omission_mode": "omission",
            },
        },
    }
    return vcf_study(
        root_path,
        "t4c8_study_2", ped_path, [vcf_path1],
        t4c8_instance,
        project_config_update=project_config_update,
        study_config_update={
            "conf_dir": str(root_path / "t4c8_study_2"),
            "person_set_collections": {
                "phenotype": {
                    "id": "phenotype",
                    "name": "Phenotype",
                    "sources": [
                        {
                            "from": "pedigree",
                            "source": "status",
                        },
                    ],
                    "default": {
                        "color": "#aaaaaa",
                        "id": "unspecified",
                        "name": "unspecified",
                    },
                    "domain": [
                        {
                            "color": "#bbbbbb",
                            "id": "epilepsy",
                            "name": "epilepsy",
                            "values": [
                                "affected",
                            ],
                        },
                        {
                            "color": "#00ff00",
                            "id": "unaffected",
                            "name": "unaffected",
                            "values": [
                                "unaffected",
                            ],
                        },
                    ],
                },
                "selected_person_set_collections": [
                    "phenotype",
                ],
            },
        })


def _t4c8_dataset(
    t4c8_instance: GPFInstance,
) -> None:
    """Configure ``t4c8_dataset`` grouping ``t4c8_study_1`` and ``_2``.

    Ensures ``<dae_dir>/datasets`` exists and writes the dataset config via
    ``setup_dataset_config`` with a status-based ``phenotype`` person set
    collection.

    NOTE(review): YAML nesting was restored from a whitespace-collapsed
    listing -- confirm against upstream.
    """
    root_path = pathlib.Path(t4c8_instance.dae_dir)
    (root_path / "datasets").mkdir(exist_ok=True)

    # NOTE(review): the "dataset " literal (trailing space, singular) is
    # kept exactly as found; it does not match the "datasets" directory
    # created above -- confirm whether that is intended.
    setup_dataset_config(
        t4c8_instance,
        "t4c8_dataset",
        ["t4c8_study_1", "t4c8_study_2"],
        dataset_config_update=textwrap.dedent(f"""
            conf_dir: { root_path / "dataset "}
            person_set_collections:
              phenotype:
                id: phenotype
                name: Phenotype
                sources:
                - from: pedigree
                  source: status
                domain:
                - color: '#4b2626'
                  id: developmental_disorder
                  name: developmental disorder
                  values:
                  - affected
                - color: '#ffffff'
                  id: unaffected
                  name: unaffected
                  values:
                  - unaffected
                default:
                  color: '#aaaaaa'
                  id: unspecified
                  name: unspecified
              selected_person_set_collections:
              - phenotype"""))
def setup_wgpf_instance(root_path: pathlib.Path) -> WGPFInstance:
    """Build a ``WGPFInstance`` on top of a freshly created t4c8 instance.

    The t4c8 GPF instance is created with ``setup_t4c8_instance``; the WGPF
    instance is then built from the ``gpf_instance.yaml`` found in that
    instance's ``dae_dir``, reusing the same genomic resource repository.
    """
    t4c8_instance = setup_t4c8_instance(root_path)
    # Use a separate local instead of rebinding the ``root_path`` parameter.
    instance_dir = pathlib.Path(t4c8_instance.dae_dir)
    instance_filename = str(instance_dir / "gpf_instance.yaml")
    return WGPFInstance.build(instance_filename, grr=t4c8_instance.grr)
def _t4c8_default_study_config(instance_path: pathlib.Path) -> None:
    """Write ``default_study_configuration.yaml`` into ``instance_path``.

    The configuration enables the genotype browser, common report, denovo
    gene sets, enrichment and gene browser sections.

    NOTE(review): YAML nesting was restored from a whitespace-collapsed
    listing -- confirm against the upstream file.
    """
    # "%%" sequences are kept exactly as found in the original literal.
    setup_directories(
        instance_path, {
            "default_study_configuration.yaml": textwrap.dedent("""
                phenotype_browser: false
                phenotype_tool: false
                study_type:
                - WE
                study_phenotype: autism
                has_transmitted: true
                has_denovo: true
                has_complex: true
                has_cnv: false
                genome: hg38
                chr_prefix: true
                person_set_collections:
                  selected_person_set_collections:
                  - phenotype
                  phenotype:
                    id: phenotype
                    name: Phenotype
                    sources:
                    - from: pedigree
                      source: status
                    domain:
                    - id: autism
                      name: autism
                      values:
                      - affected
                      color: '#ff2121'
                    - id: unaffected
                      name: unaffected
                      values:
                      - unaffected
                      color: '#ffffff'
                    default:
                      id: unspecified
                      name: unspecified
                      color: '#aaaaaa'
                genotype_browser:
                  enabled: true
                  has_family_filters: true
                  has_person_filters: true
                  has_study_filters: false
                  has_present_in_child: true
                  has_present_in_parent: true
                  has_pedigree_selector: true
                  preview_columns:
                  - family
                  - variant
                  - genotype
                  - effect
                  - gene_scores
                  - freq
                  - pheno_measures
                  download_columns:
                  - family
                  - study_phenotype
                  - variant
                  - variant_extra
                  - family_person_ids
                  - family_structure
                  - best
                  - family_genotype
                  - carriers
                  - inheritance
                  - phenotypes
                  - par_called
                  - allele_freq
                  - effect
                  - geneeffect
                  - effectdetails
                  - gene_scores
                  - pheno_measures
                  summary_preview_columns:
                  - variant
                  - seen_as_denovo
                  - seen_in_affected
                  - seen_in_unaffected
                  - par_called
                  - allele_freq
                  - effect
                  - count
                  - geneeffect
                  - effectdetails
                  - gene_scores
                  summary_download_columns:
                  - variant
                  - seen_as_denovo
                  - seen_in_affected
                  - seen_in_unaffected
                  - par_called
                  - allele_freq
                  - effect
                  - count
                  - geneeffect
                  - effectdetails
                  - gene_scores
                  column_groups:
                    genotype:
                      name: genotype
                      columns:
                      - pedigree
                      - carrier_person_attributes
                      - family_person_attributes
                    effect:
                      name: effect
                      columns:
                      - worst_effect
                      - genes
                    gene_scores:
                      name: vulnerability/intolerance
                      columns:
                      - t4c8_score
                    family:
                      name: family
                      columns:
                      - family_id
                      - study
                    variant:
                      name: variant
                      columns:
                      - location
                      - variant
                    variant_extra:
                      name: variant
                      columns:
                      - chrom
                      - position
                      - reference
                      - alternative
                    carriers:
                      name: carriers
                      columns:
                      - carrier_person_ids
                      - carrier_person_attributes
                    phenotypes:
                      name: phenotypes
                      columns:
                      - family_phenotypes
                      - carrier_phenotypes
                    freq:
                      name: Frequency
                      columns:
                      - freq_ssc
                      - freq_exome_gnomad
                      - freq_genome_gnomad
                    pheno_measures:
                      name: pheno measures
                      columns:
                      - pheno_age
                      - pheno_iq
                  columns:
                    genotype:
                      pedigree:
                        name: pedigree
                        source: pedigree
                      worst_effect:
                        name: worst effect
                        source: worst_effect
                      genes:
                        name: genes
                        source: genes
                      t4c8_score:
                        name: t4c8 score
                        source: t4c8_score
                        format: '%%f'
                      family_id:
                        name: family id
                        source: family
                      study:
                        name: study
                        source: study_name
                      family_person_ids:
                        name: family person ids
                        source: family_person_ids
                      location:
                        name: location
                        source: location
                      variant:
                        name: variant
                        source: variant
                      chrom:
                        name: CHROM
                        source: chrom
                      position:
                        name: POS
                        source: position
                      reference:
                        name: REF
                        source: reference
                      alternative:
                        name: ALT
                        source: alternative
                      carrier_person_ids:
                        name: carrier person ids
                        source: carrier_person_ids
                      carrier_person_attributes:
                        name: carrier person attributes
                        source: carrier_person_attributes
                      family_person_attributes:
                        name: family person attributes
                        source: family_person_attributes
                      family_phenotypes:
                        name: family phenotypes
                        source: family_phenotypes
                      carrier_phenotypes:
                        name: carrier phenotypes
                        source: carrier_phenotypes
                      inheritance:
                        name: inheritance type
                        source: inheritance_type
                      study_phenotype:
                        name: study phenotype
                        source: study_phenotype
                      best:
                        name: family best state
                        source: best_st
                      family_genotype:
                        name: family genotype
                        source: genotype
                      family_structure:
                        name: family structure
                        source: family_structure
                      geneeffect:
                        name: all effects
                        source: effects
                      effectdetails:
                        name: effect details
                        source: effect_details
                      alt_alleles:
                        name: alt alleles
                        source: af_allele_count
                      par_called:
                        name: parents called
                        source: af_parents_called_count
                      allele_freq:
                        name: allele frequency
                        source: af_allele_freq
                      seen_as_denovo:
                        name: seen_as_denovo
                        source: seen_as_denovo
                      seen_in_affected:
                        name: seen_in_affected
                        source: seen_in_affected
                      seen_in_unaffected:
                        name: seen_in_unaffected
                        source: seen_in_unaffected
                    phenotype:
                      pheno_age:
                        role: prb
                        source: "i1.age"
                        format: "%%.3f"
                        name: Age
                      pheno_iq:
                        role: prb
                        source: "i1.iq"
                        format: "%%.3f"
                        name: IQ
                common_report:
                  enabled: true
                  effect_groups:
                  - LGDs
                  - nonsynonymous
                  - UTRs
                  - CNV
                  effect_types:
                  - Nonsense
                  - Frame-shift
                  - Splice-site
                  - Missense
                  - No-frame-shift
                  - noStart
                  - noEnd
                  - Synonymous
                  - Non coding
                  - Intron
                  - Intergenic
                  - 3'-UTR
                  - 5'-UTR
                denovo_gene_sets:
                  enabled: true
                  selected_person_set_collections:
                  - phenotype
                  standard_criterias:
                    effect_types:
                      segments:
                        LGDs: LGDs
                        Missense: missense
                        Synonymous: synonymous
                    sexes:
                      segments:
                        Female: F
                        Male: M
                        Unspecified: U
                  recurrency_criteria:
                    segments:
                      Single:
                        start: 1
                        end: 2
                      Triple:
                        start: 3
                        end: -1
                      Recurrent:
                        start: 2
                        end: -1
                  gene_sets_names:
                  - LGDs
                  - LGDs.Male
                  - LGDs.Female
                  - LGDs.Recurrent
                  - LGDs.Single
                  - LGDs.Triple
                  - Missense
                  - Missense.Male
                  - Missense.Female
                  - Missense.Recurrent
                  - Missense.Triple
                  - Synonymous
                  - Synonymous.Male
                  - Synonymous.Female
                  - Synonymous.Recurrent
                  - Synonymous.Triple
                enrichment:
                  enabled: true
                  selected_person_set_collections:
                  - phenotype
                  selected_background_models:
                  - coding_len_background
                  default_background_model: coding_len_background
                  selected_counting_models:
                  - enrichment_gene_counting
                  - enrichment_events_counting
                  counting:
                    enrichment_gene_counting:
                      id: enrichment_gene_counting
                      name: Counting affected genes
                      desc: Counting affected genes
                    enrichment_events_counting:
                      id: enrichment_events_counting
                      name: Counting events
                      desc: Counting events
                  default_counting_model: enrichment_gene_counting
                  effect_types:
                  - Nonsense
                  - Frame-shift
                  - Splice-site
                  - Missense
                  - No-frame-shift
                  - noStart
                  - noEnd
                  - Synonymous
                  - Non coding
                  - Intron
                  - Intergenic
                  - 3'-UTR
                  - 5'-UTR
                  - CNV+
                  - CNV-
                gene_browser:
                  enabled: true
                  frequency_column: "score_one"
                  effect_column: "effect.worst effect type"
                  location_column: "variant.location"
                  domain_min: 0.01
                  domain_max: 100
            """),
        },
    )


def _study_1_pheno(
    root_path: pathlib.Path,
    instance_path: pathlib.Path,
) -> None:
    """Import the ``study_1_pheno`` phenotype data into the instance.

    Writes a pedigree, one instrument file (``i1.csv``) and a regressions
    config under ``root_path / "study_1_pheno_import"``, then runs
    ``pheno_import`` with output in ``instance_path / "pheno" /
    "study_1_pheno"``.

    NOTE(review): row breaks and YAML nesting inside the literals were
    restored from a whitespace-collapsed listing -- confirm against
    upstream.
    """
    pheno_path = root_path / "study_1_pheno_import"
    ped_path = setup_pedigree(
        pheno_path / "pedigree" / "study_1_pheno.ped",
        textwrap.dedent("""
            familyId personId dadId momId sex status role phenotype
            f1.1     mom1     0     0     2   1      mom  unaffected
            f1.1     dad1     0     0     1   1      dad  unaffected
            f1.1     p1       dad1  mom1  2   2      prb  autism
            f1.1     s1       dad1  mom1  1   1      sib  unaffected
            f1.2     mom2     0     0     2   1      mom  unaffected
            f1.2     dad2     0     0     1   1      dad  unaffected
            f1.2     p2       dad2  mom2  2   2      prb  autism
            f1.2     s2       dad2  mom2  1   1      sib  unaffected
            f1.3     mom3     0     0     2   1      mom  unaffected
            f1.3     dad3     0     0     1   1      dad  unaffected
            f1.3     p3       dad3  mom3  2   2      prb  autism
            f1.3     s3       dad3  mom3  2   1      sib  unaffected
            f1.4     mom4     0     0     2   1      mom  unaffected
            f1.4     dad4     0     0     1   1      dad  unaffected
            f1.4     p4       dad4  mom4  2   2      prb  autism
            f1.4     s4       dad4  mom4  2   1      sib  unaffected
        """),
    )
    setup_directories(
        pheno_path / "instruments", {
            "i1.csv": textwrap.dedent("""
                personId,age,iq,m1,m2,m3,m4,m5
                mom1,495.85101568044115,97.50432405604393,52.81283557677513,30.02770124013255,71.37577329050546,7,val3
                dad1,455.7415088310677,95.69209763066596,30.17069676417365,46.09107120958192,80.80918132613797,6,val5
                p1,166.33975600961486,104.91189182223437,110.71113119414974,28.525899172698242,35.91763476048754,0,val3
                s1,171.7517126432528,38.666056616173776,89.98648478019244,45.48364527683189,36.402944728465634,1,val2
                mom2,538.9804553566523,77.21819916001459,54.140552015763305,46.634514570013124,57.885493130264315,5,val3
                dad2,565.9100943623504,74.26681832043354,63.03565166617398,36.205901443513405,88.42665767730243,8,val4
                p2,111.53800328766471,66.69411560428445,75.83138674585497,43.482874849182046,42.4619179257155,0,val2
                s2,112.55713299362333,103.40031120687064,81.23597041806396,26.159521971641645,34.43553369099789,0,val3
                mom3,484.44595137123844,65.76732558306583,91.03624223708377,60.66214100006954,82.3034749091715,6,val3
                dad3,529.0340708815538,102.32942897750618,102.99152655929812,49.50549744685827,74.83036326691582,2,val1
                p3,68.00148724003327,69.33300891928155,96.6345202846831,39.854725276645524,41.07164247649136,2,val1
                s3,82.79666720433862,14.497397082398294,70.28387304358455,36.733060149749015,32.979273050187054,0,val3
                mom4,413.46229185729595,100.18402999912475,80.87413378193011,56.58170217214086,52.756604936750776,2,val4
                dad4,519.696209236225,95.17277547237524,50.73287772082178,34.58584942696778,63.241999271724694,2,val3
                p4,157.61834502034586,103.07449426952655,99.54884909890457,37.31662520714209,50.87487739184816,2,val1
                s4,121.0199895975403,39.74107684421966,77.32212831797972,51.37116746952451,36.558215318085175,1,val4
            """),  # noqa: E501
        })

    setup_directories(
        pheno_path, {
            "regressions.yaml": textwrap.dedent("""
                regression:
                  age:
                    instrument_name: "i1"
                    measure_name: "age"
                    display_name: "Age"
                    jitter: 0.1
                  iq:
                    instrument_name: "i1"
                    measure_name: "iq"
                    display_name: "Non verbal IQ"
                    jitter: 0.1
            """),
        })

    pheno_import([
        "--pheno-id", "study_1_pheno",
        "-p", str(ped_path),
        "-i", str(pheno_path / "instruments"),
        "--force",
        "-j", "1",
        "--person-column", "personId",
        "-o", str(instance_path / "pheno" / "study_1_pheno"),
        "--task-status-dir", str(pheno_path / "status"),
        "--regression", str(pheno_path / "regressions.yaml"),
    ])


def _t4c8_study_4(
    root_path: pathlib.Path,
    t4c8_instance: GPFInstance,
) -> GenotypeData:
    """Import ``t4c8_study_4``: three families with five chr1 variants.

    Writes the pedigree and VCF under ``root_path / "t4c8_study_4"`` and
    imports them with ``vcf_study``.

    NOTE(review): row breaks in the pedigree/VCF literals were restored
    from a whitespace-collapsed listing -- confirm against upstream.
    """
    ped_path = setup_pedigree(
        root_path / "t4c8_study_4" / "pedigree" / "in.ped",
        """
familyId personId dadId  momId  sex status role phenotype
f4.1     mom4.1   0      0      2   1      mom  unaffected
f4.1     dad4.1   0      0      1   1      dad  unaffected
f4.1     p4.1     dad4.1 mom4.1 2   2      prb  autism
f4.1     s4.1     dad4.1 mom4.1 1   1      sib  unaffected
f4.3     mom4.3   0      0      2   1      mom  unaffected
f4.3     dad4.3   0      0      1   1      dad  unaffected
f4.3     p4.3     dad4.3 mom4.3 2   2      prb  autism
f4.3     s4.3     dad4.3 mom4.3 2   1      sib  unaffected
f4.5     mom4.5   0      0      2   1      mom  unaffected
f4.5     dad4.5   0      0      1   1      dad  unaffected
f4.5     p4.5     dad4.5 mom4.5 1   2      prb  autism
        """)
    vcf_path1 = setup_vcf(
        root_path / "t4c8_study_4" / "vcf" / "in.vcf.gz",
        """
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
##contig=<ID=chr2>
##contig=<ID=chr3>
#CHROM POS ID  REF ALT QUAL FILTER INFO FORMAT mom4.1 dad4.1 p4.1 s4.1 mom4.3 dad4.3 p4.3 s4.3 mom4.5 dad4.5 p4.5
chr1   52  MIS C   A   .    .      .    GT     0/0    0/0    0/1  0/0  0/0    0/0    0/0  0/0  0/0    0/0    0/0
chr1   54  SYN T   C   .    .      .    GT     0/0    0/0    0/0  0/0  0/0    0/0    0/1  0/0  0/0    0/0    0/0
chr1   57  SYN A   C   .    .      .    GT     0/0    0/0    0/1  0/0  0/0    0/0    0/1  0/1  0/0    0/0    0/1
chr1   117 MIS T   G   .    .      .    GT     0/0    0/0    0/1  0/0  0/0    0/0    0/0  0/1  0/0    0/0    0/1
chr1   119 SYN A   G   .    .      .    GT     0/0    0/0    0/0  0/1  0/0    0/0    0/0  0/0  0/0    0/0    0/0
        """)  # noqa: E501

    return vcf_study(
        root_path,
        "t4c8_study_4", ped_path, [vcf_path1],
        t4c8_instance,
        project_config_update={
            "input": {
                "vcf": {
                    "denovo_mode": "denovo",
                    "omission_mode": "omission",
                },
            },
        },
    )