Source code for dae.annotation.record_to_annotatable

from __future__ import annotations

import abc
import argparse
import logging

from dae.annotation.annotatable import (
    Annotatable,
    CNVAllele,
    Position,
    Region,
    VCFAllele,
)
from dae.genomic_resources.reference_genome import ReferenceGenome
from dae.utils.cnv_utils import cnv_variant_type, cshl2cnv_variant
from dae.utils.dae_utils import cshl2vcf_variant, dae2vcf_variant

logger = logging.getLogger(__name__)


class RecordToAnnotable(abc.ABC):
    """Abstract converter that turns a tabular record into an annotatable.

    Subclasses are given the tuple of column names they should read from
    each record and an optional reference genome, and must implement
    :meth:`build`.
    """

    def __init__(
        self,
        columns: tuple,
        ref_genome: ReferenceGenome | None,
    ):
        # Stored verbatim; subclasses unpack ``columns`` into named fields.
        self.ref_genome = ref_genome
        self.columns = columns

    @abc.abstractmethod
    def build(self, record: dict[str, str]) -> Annotatable:
        """Create the annotatable described by ``record``."""
class RecordToPosition(RecordToAnnotable):
    """Build a ``Position`` annotatable from a chromosome and a position."""

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly two column names are expected: chromosome, then position.
        (self.chrom_column, self.pos_column) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return the ``Position`` read from ``record``.

        The position value is converted with ``int``.
        """
        chrom = record[self.chrom_column]
        pos = int(record[self.pos_column])
        return Position(chrom, pos)
class RecordToRegion(RecordToAnnotable):
    """Build a ``Region`` annotatable from chromosome, begin and end columns."""

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly three column names are expected: chromosome, begin, end.
        (
            self.chrom_col,
            self.pos_beg_col,
            self.pos_end_col,
        ) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return the ``Region`` read from ``record``.

        Both coordinates are converted with ``int``.
        """
        chrom = record[self.chrom_col]
        begin = int(record[self.pos_beg_col])
        end = int(record[self.pos_end_col])
        return Region(chrom, begin, end)
class RecordToVcfAllele(RecordToAnnotable):
    """Build a ``VCFAllele`` from chromosome, position, ref and alt columns."""

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly four column names are expected, in this order.
        (
            self.chrom_col,
            self.pos_col,
            self.ref_col,
            self.alt_col,
        ) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return the ``VCFAllele`` read from ``record``.

        The position value is converted with ``int``; the other fields are
        passed through as strings.
        """
        chrom = record[self.chrom_col]
        pos = int(record[self.pos_col])
        ref = record[self.ref_col]
        alt = record[self.alt_col]
        return VCFAllele(chrom, pos, ref, alt)
class VcfLikeRecordToVcfAllele(RecordToAnnotable):
    """Build a ``VCFAllele`` from a single colon-separated column.

    The column value is split on ``":"`` into exactly four fields:
    chromosome, position, reference and alternative.
    """

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # A one-element tuple is expected; unpacking enforces that.
        (self.vcf_like_col,) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Parse the VCF-like column of ``record`` into a ``VCFAllele``.

        Raises ``ValueError`` (from the unpacking) when the value does not
        split into exactly four fields.
        """
        fields = record[self.vcf_like_col].split(":")
        chrom, pos, ref, alt = fields
        return VCFAllele(chrom, int(pos), ref, alt)
class RecordToCNVAllele(RecordToAnnotable):
    """Build a ``CNVAllele`` from chromosome, begin, end and CNV type columns."""

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly four column names are expected, in this order.
        (
            self.chrom_col,
            self.pos_beg_col,
            self.pos_end_col,
            self.cnv_type_col,
        ) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return the ``CNVAllele`` read from ``record``.

        Raises
        ------
        ValueError
            If ``cnv_variant_type`` does not recognise the value of the CNV
            type column (i.e. it returns ``None``).
        """
        cnv_type = cnv_variant_type(record[self.cnv_type_col])
        if cnv_type is None:
            raise ValueError(
                f"unexpected CNV variant type: {record[self.cnv_type_col]}")

        chrom = record[self.chrom_col]
        begin = int(record[self.pos_beg_col])
        end = int(record[self.pos_end_col])
        allele_type = CNVAllele.Type.from_string(cnv_type)
        return CNVAllele(chrom, begin, end, allele_type)
class CSHLAlleleRecordToAnnotatable(RecordToAnnotable):
    """Transform a CSHL variant record into a VCF or CNV allele annotatable.

    The record is read through two columns: a location and a variant
    description.
    """

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly two column names are expected: location, then variant.
        (self.location_col, self.variant_col) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return a ``CNVAllele`` for CNV variants, a ``VCFAllele`` otherwise.

        A variant counts as a CNV when ``cnv_variant_type`` returns a
        non-``None`` value for it.
        """
        variant = record[self.variant_col]
        is_cnv = cnv_variant_type(variant) is not None
        location = record[self.location_col]

        if not is_cnv:
            vcf_fields = cshl2vcf_variant(location, variant, self.ref_genome)
            return VCFAllele(*vcf_fields)

        chrom, pos_begin, pos_end, cnv_type = cshl2cnv_variant(
            location, variant)
        assert cnv_type is not None
        allele_type = CNVAllele.Type.from_string(cnv_type)
        return CNVAllele(chrom, pos_begin, pos_end, allele_type)
class DaeAlleleRecordToAnnotatable(RecordToAnnotable):
    """Transform a DAE variant record into a VCF allele annotatable.

    The record is read through three columns: chromosome, position and a
    variant description that ``dae2vcf_variant`` converts.
    """

    def __init__(self, columns: tuple, ref_genome: ReferenceGenome | None):
        super().__init__(columns, ref_genome)
        # Exactly three column names are expected, in this order.
        (
            self.chrom_column,
            self.pos_column,
            self.variant_column,
        ) = columns

    def build(self, record: dict[str, str]) -> Annotatable:
        """Return the ``VCFAllele`` equivalent of the DAE variant in ``record``.

        The chromosome is taken from the record as is; the remaining
        ``VCFAllele`` arguments are whatever ``dae2vcf_variant`` returns.
        """
        variant = record[self.variant_column]
        chrom = record[self.chrom_column]
        position = int(record[self.pos_column])
        vcf_fields = dae2vcf_variant(
            chrom, position, variant, self.ref_genome)
        return VCFAllele(chrom, *vcf_fields)
# Supported column layouts, each mapped to the converter class that handles
# it. build_record_to_annotatable() walks this mapping in insertion order and
# returns the first layout whose columns are all available, so entry order
# matters: a layout that extends another one (e.g. the CNV layout vs. the
# plain region layout, or the VCF layout vs. the plain position layout) has
# to be listed before the shorter one, otherwise it would never be selected.
RECORD_TO_ANNOTATABLE_CONFIGURATION: dict[tuple, type[RecordToAnnotable]] = {
    ("chrom", "pos_beg", "pos_end", "cnv_type"): RecordToCNVAllele,
    ("chrom", "pos_beg", "pos_end"): RecordToRegion,
    ("chrom", "pos", "ref", "alt"): RecordToVcfAllele,
    ("vcf_like",): VcfLikeRecordToVcfAllele,
    ("chrom", "pos", "variant"): DaeAlleleRecordToAnnotatable,
    ("location", "variant"): CSHLAlleleRecordToAnnotatable,
    ("chrom", "pos"): RecordToPosition,
}
def add_record_to_annotable_arguments(parser: argparse.ArgumentParser) -> None:
    """Register a ``--col-<name>`` option for every supported column.

    One option is added per distinct column identifier that appears in
    ``RECORD_TO_ANNOTATABLE_CONFIGURATION``. Underscores in the identifier
    are replaced by dashes in the option name, and the default value of each
    option is the identifier itself.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        The parser the options are added to; it is modified in place.
    """
    # De-duplicate with dict.fromkeys instead of a set: a set of strings
    # iterates in an order that changes between interpreter runs (hash
    # randomization), which made the option order in ``--help`` unstable.
    # dict.fromkeys keeps the first-seen order from the configuration.
    all_columns = dict.fromkeys(
        col
        for cols in RECORD_TO_ANNOTATABLE_CONFIGURATION
        for col in cols
    )
    for col in all_columns:
        parser.add_argument(
            f"--col-{col.replace('_', '-')}",
            default=col,
            help=(
                f"The column name that stores {col}. "
                f'Use "-" to exclude this column requirement, causing '
                f'annotatable patterns that require {col} to be skipped.'
            ),
        )
def build_record_to_annotatable(
    renamed_columns: dict[str, str],
    available_columns: set[str],
    ref_genome: ReferenceGenome | None = None,
) -> RecordToAnnotable:
    """
    Transform a variant record into an annotatable.

    The layouts in ``RECORD_TO_ANNOTATABLE_CONFIGURATION`` are tried in
    order; the first one whose (possibly renamed) columns are all present in
    ``available_columns`` determines the converter that is returned.

    Parameters
    ----------
    renamed_columns : dict[str, str]
        Mapping from expected internal column identifiers
        (e.g. "col_<field>") to the actual column names present
        in the input source.
        A column can be excluded from usage if an identifier is
        mapped to "-".

        Example rename:
          "col_<field>": "<input source column name for the field>"
        Example exclude:
          "col_<field>": "-"
    available_columns : set[str]
        The set of column names available in the input records.
    ref_genome : ReferenceGenome | None, optional
        Optional reference genome context used for creating annotatables.
        Not all annotatables require it.

    Returns
    -------
    RecordToAnnotable
        An instance of the first matching converter class, constructed with
        the resolved column names and ``ref_genome``.

    Raises
    ------
    ValueError
        If no layout can be satisfied by the available columns.
    """
    for annotatable_columns, record_to_annotatable_class in \
            RECORD_TO_ANNOTATABLE_CONFIGURATION.items():
        columns = tuple(
            renamed_columns.get(f"col_{annot_col}", annot_col)
            for annot_col in annotatable_columns
        )

        # "-" is the documented marker for an excluded column. Skip the
        # layout explicitly rather than relying on "-" never being one of
        # the available column names.
        if "-" in columns:
            continue

        if not set(columns).issubset(available_columns):
            continue

        logger.info(
            "record to annotatable using %s(%s, ref_genome=%s)",
            record_to_annotatable_class.__name__,
            columns,
            # Compare against None instead of using truthiness so that the
            # log line does not depend on how the genome object evaluates
            # in a boolean context.
            ref_genome.resource_id if ref_genome is not None else None,
        )
        return record_to_annotatable_class(columns, ref_genome)

    raise ValueError("no record to annotatable could be found.")