# Source code for dae.genomic_resources.genomic_position_table.table_vcf

from __future__ import annotations

from collections.abc import Generator
from functools import cache

import pysam

from dae.genomic_resources.repository import GenomicResource

from .line import VCFLine
from .table_tabix import TabixGenomicPositionTable


class VCFGenomicPositionTable(TabixGenomicPositionTable):
    """Genomic position table whose records come from a VCF file.

    Every record fetched from the VCF is expanded into one ``VCFLine`` per
    alternative allele. The INFO metadata is loaded at construction time
    from a companion header file located next to the data file.
    """

    # Core column names; begin and end are both taken from ``POS``.
    CHROM = "CHROM"
    POS_BEGIN = "POS"
    POS_END = "POS"

    def __init__(
        self,
        genomic_resource: GenomicResource,
        table_definition: dict,
    ):
        """Initialise the base table and load the VCF INFO metadata."""
        super().__init__(genomic_resource, table_definition)
        self.header = self._load_vcf_header()

    def _load_vcf_header(self) -> pysam.VariantHeaderMetadata:
        """Return the INFO metadata read from the companion header file.

        The header file name is derived from the configured file name by
        inserting ``.header`` right before the first ``.vcf`` occurrence
        (``x.vcf.gz`` becomes ``x.header.vcf.gz``).

        Raises:
            AssertionError: if ``header_mode`` is configured to anything
                other than ``"file"``, or the header file does not exist.
            ValueError: if the configured file name contains no ``.vcf``.
        """
        header_mode = self.definition.get("header_mode", "file")
        assert header_mode == "file"

        data_filename = self.definition.filename
        split_at = data_filename.index(".vcf")
        stem, suffix = data_filename[:split_at], data_filename[split_at:]
        header_filename = f"{stem}.header{suffix}"

        resource = self.genomic_resource
        assert resource.file_exists(header_filename), \
            "VCF tables must have an accompanying *.header.vcf.gz file!"
        header_file = resource.open_vcf_file(header_filename)
        return header_file.header.info

    def _transform_vcf_result(self, line: VCFLine) -> None:
        """Overwrite ``line.chrom`` with its mapped chromosome name."""
        mapped_chrom = self._map_result_chrom(line.chrom)
        assert mapped_chrom is not None
        line.chrom = mapped_chrom

    def _make_vcf_line(
        self,
        raw_line: pysam.VariantRecord,
        allele_index: int | None,
    ) -> VCFLine | None:
        """Wrap a raw record in a ``VCFLine`` for the given allele.

        When a reverse chromosome map is in effect, lines whose file
        chromosome is missing from the map are dropped (``None`` is
        returned) and the remaining ones get their chromosome remapped.
        Without a map the line is returned untouched.
        """
        vcf_line: VCFLine = VCFLine(raw_line, allele_index)
        chrom_map = self.rev_chrom_map
        if chrom_map:
            if vcf_line.fchrom not in chrom_map:
                return None
            self._transform_vcf_result(vcf_line)
        return vcf_line

    def open(self) -> VCFGenomicPositionTable:
        """Open the VCF file, set up column keys and chromosome mapping.

        Returns:
            This table instance.
        """
        vcf_filename = self.definition.filename
        self.pysam_file = self.genomic_resource.open_vcf_file(vcf_filename)
        self._set_core_column_keys()
        self._build_chrom_mapping()
        return self

    # NOTE(review): ``functools.cache`` on a method keys the cache on
    # ``self`` and keeps instances referenced for the cache's lifetime;
    # the pylint suppression suggests this is a deliberate choice.
    @cache  # pylint: disable=method-cache-max-size-none
    def get_file_chromosomes(self) -> list[str]:
        """Return the contig names listed in the file's tabix index."""
        with self.genomic_resource.open_tabix_file(
            self.definition.filename,
        ) as tabix_file:
            contig_names = tabix_file.contigs
        return [str(name) for name in contig_names]

    def get_line_iterator(
        self,
        chrom: str | None = None,
        pos_begin: int | None = None,
    ) -> Generator[VCFLine | None, None, None]:
        """Iterate over the VCF, yielding one entry per allele.

        Args:
            chrom: chromosome to fetch; it is translated to the file's
                contig name before fetching. ``None`` applies no contig
                restriction.
            pos_begin: start position, handed to ``fetch`` unchanged.

        Yields:
            One ``VCFLine`` per alternative allele of every fetched record.
            A record with no alternative alleles yields a single entry
            with allele index ``None``. An entry is ``None`` when the
            record's contig is filtered out by the chromosome mapping.

        Raises:
            ValueError: if ``chrom`` cannot be mapped to a file contig.
        """
        assert isinstance(self.pysam_file, pysam.VariantFile)

        fchrom: str | None = None
        if chrom is not None:
            fchrom = self.unmap_chromosome(chrom)
            if fchrom is None:
                raise ValueError(
                    f"error in mapping chromosome {chrom} to file contigs: "
                    f"{self.get_file_chromosomes()}")

        self.stats["tabix fetch"] += 1
        # presumably drops lines buffered by a previous fetch; the buffer is
        # managed by the base class - confirm there
        self.buffer.clear()

        for record in self.pysam_file.fetch(fchrom, pos_begin):
            # A record without ALT alleles is still reported once.
            alternatives = record.alts or [None]
            for position, alt in enumerate(alternatives):
                assert record.ref is not None
                allele_index = None if alt is None else position
                yield self._make_vcf_line(record, allele_index)