from __future__ import annotations
import logging
from typing import Any
from dae.utils.regions import (
BedRegion,
)
logger = logging.getLogger(__name__)
[docs]
class Exon:
"""Represent a single exon within a transcript.
An exon is a segment of a transcript that is retained in the mature RNA
after splicing. This class stores the genomic coordinates and codon
reading frame of an exon.
Attributes:
start (int): Genomic start position (1-based, inclusive).
stop (int): Genomic end position (1-based, inclusive).
frame (int | None): Codon reading frame (0, 1, or 2) for coding
exons, or None for non-coding exons or UTR regions.
Example:
>>> exon = Exon(start=100, stop=200, frame=0)
>>> print(exon.start, exon.stop)
100 200
>>> exon.contains((150, 160))
True
"""
def __init__(
self,
start: int,
stop: int,
frame: int | None = None,
):
"""Initialize exon model.
Args:
start: The genomic start position of the exon (1-based).
stop (int): The genomic stop position of the exon
(1-based, closed).
frame (Optional[int]): The frame of the exon.
"""
self.start = start
self.stop = stop
self.frame = frame # related to cds
def __repr__(self) -> str:
return f"Exon(start={self.start}; stop={self.stop})"
[docs]
def contains(self, region: tuple[int, int]) -> bool:
"""Check if this exon fully contains a genomic region.
Args:
region (tuple[int, int]): A (start, stop) position tuple to check.
Returns:
bool: True if the region is fully contained within this exon.
Example:
>>> exon = Exon(100, 200)
>>> exon.contains((150, 160))
True
>>> exon.contains((50, 250))
False
"""
start, stop = region
return self.start <= start and self.stop >= stop
[docs]
class TranscriptModel:
"""Represent a transcript with all its structural features.
A transcript model contains complete information about a gene transcript,
including its genomic location, exon structure, coding regions, and
additional attributes from the source annotation.
Attributes:
gene (str): Gene name/symbol (e.g., "TP53").
tr_id (str): Transcript identifier, unique within the gene models.
tr_name (str): Original transcript name from source annotation.
chrom (str): Chromosome name (e.g., "chr17", "17").
strand (str): Strand orientation ("+" or "-").
tx (tuple[int, int]): Transcript start and end positions
(1-based, closed interval).
cds (tuple[int, int]): Coding sequence start and end positions
(1-based, closed interval). For non-coding transcripts,
cds[0] >= cds[1].
exons (list[Exon]): List of Exon objects in genomic order.
attributes (dict[str, Any]): Additional annotation attributes
(e.g., gene_biotype, gene_version).
Example:
>>> from dae.genomic_resources.gene_models.transcript_models import \\
... TranscriptModel, Exon
>>> tm = TranscriptModel(
... gene="TP53",
... tr_id="ENST00000269305",
... tr_name="TP53-201",
... chrom="17",
... strand="-",
... tx=(7661779, 7687550),
... cds=(7668402, 7687490),
... exons=[Exon(7661779, 7661822), Exon(7668402, 7669690)],
... attributes={"gene_biotype": "protein_coding"},
... )
>>> print(f"Coding: {tm.is_coding()}")
Coding: True
>>> regions = tm.cds_regions()
>>> print(f"CDS has {len(regions)} regions")
Note:
- All coordinates use 1-based, closed intervals
- CDS includes both start and stop codons
- Exons should be in genomic order (not necessarily 5' to 3')
"""
def __init__(
self,
gene: str,
tr_id: str,
tr_name: str,
chrom: str,
strand: str, *,
tx: tuple[int, int], # pylint: disable=invalid-name
cds: tuple[int, int],
exons: list[Exon] | None = None,
attributes: dict[str, Any] | None = None,
):
"""Initialize transcript model.
Args:
gene (str): The gene name.
tr_id (str): The transcript ID.
tr_name (str): The transcript name.
chrom (str): The chromosome name.
strand (str): The strand of the transcript.
tx (tuple[int, int]): The transcript start and end positions.
(1-based, closed interval)
cds (tuple[int, int]): The coding region start and end positions.
The CDS region includes the start and stop codons.
(1-based, closed interval)
exons (Optional[list[Exon]]): The list of exons. Defaults to
empty list.
attributes (Optional[dict[str, Any]]): The additional attributes.
Defaults to empty dictionary.
"""
self.gene = gene
self.tr_id = tr_id
self.tr_name = tr_name
self.chrom = chrom
self.strand = strand
self.tx = tx # pylint: disable=invalid-name
self.cds = cds
self.exons: list[Exon] = exons if exons is not None else []
self.attributes = attributes if attributes is not None else {}
[docs]
def is_coding(self) -> bool:
"""Check if this transcript is protein-coding.
Returns:
bool: True if the transcript has a coding region (CDS),
False for non-coding transcripts.
Example:
>>> if transcript.is_coding():
... cds_len = transcript.cds_len()
... print(f"CDS length: {cds_len}bp")
"""
return self.cds[0] < self.cds[1]
[docs]
def cds_regions(self, ss_extend: int = 0) -> list[BedRegion]:
"""Compute coding sequence (CDS) regions.
Extracts the portions of exons that contain coding sequence,
optionally extending into splice sites.
Args:
ss_extend (int): Number of bases to extend into splice sites
at exon boundaries. Default is 0 (no extension).
Returns:
list[BedRegion]: List of BedRegion objects representing CDS
segments. Returns empty list for non-coding transcripts.
Example:
>>> cds_regions = transcript.cds_regions()
>>> for region in cds_regions:
... print(f"{region.chrom}:{region.start}-{region.stop}")
>>> # With splice site extension
>>> extended = transcript.cds_regions(ss_extend=3)
Note:
CDS regions include both start and stop codons. Use
collect_gtf_cds_regions() from serialization module to
exclude the stop codon for GTF format.
"""
if self.cds[0] >= self.cds[1]:
return []
regions = []
k = 0
while self.exons[k].stop < self.cds[0]:
k += 1
if self.cds[1] <= self.exons[k].stop:
regions.append(
BedRegion(
chrom=self.chrom, start=self.cds[0], stop=self.cds[1]),
)
return regions
regions.append(
BedRegion(
chrom=self.chrom,
start=self.cds[0],
stop=self.exons[k].stop + ss_extend,
),
)
k += 1
while k < len(self.exons) and self.exons[k].stop <= self.cds[1]:
if self.exons[k].stop < self.cds[1]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start - ss_extend,
stop=self.exons[k].stop + ss_extend,
),
)
k += 1
else:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start - ss_extend,
stop=self.exons[k].stop,
),
)
return regions
if k < len(self.exons) and self.exons[k].start <= self.cds[1]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start - ss_extend,
stop=self.cds[1],
),
)
return regions
[docs]
def utr5_regions(self) -> list[BedRegion]:
"""Get 5' untranslated region (5' UTR) segments.
The 5' UTR extends from the transcription start to the start codon
(translation start). Strand orientation is considered.
Returns:
list[BedRegion]: List of 5' UTR regions. Returns empty list for
non-coding transcripts.
Example:
>>> utr5 = transcript.utr5_regions()
>>> utr5_length = sum(r.stop - r.start + 1 for r in utr5)
>>> print(f"5' UTR: {utr5_length}bp")
Note:
For positive strand: regions before CDS start.
For negative strand: regions after CDS end.
"""
if self.cds[0] >= self.cds[1]:
return []
regions = []
k = 0
if self.strand == "+":
while self.exons[k].stop < self.cds[0]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start,
stop=self.exons[k].stop,
),
)
k += 1
if self.exons[k].start < self.cds[0]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start,
stop=self.cds[0] - 1,
),
)
else:
while self.exons[k].stop < self.cds[1] and k < len(self.exons):
k += 1
if self.exons[k].stop == self.cds[1]:
k += 1
else:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.cds[1] + 1,
stop=self.exons[k].stop,
),
)
k += 1
regions.extend([
BedRegion(chrom=self.chrom, start=exon.start, stop=exon.stop)
for exon in self.exons[k:]
])
return regions
[docs]
def utr3_regions(self) -> list[BedRegion]:
"""Get 3' untranslated region (3' UTR) segments.
The 3' UTR extends from the stop codon (translation end) to the
transcription end. Strand orientation is considered.
Returns:
list[BedRegion]: List of 3' UTR regions. Returns empty list for
non-coding transcripts.
Example:
>>> utr3 = transcript.utr3_regions()
>>> utr3_length = sum(r.stop - r.start + 1 for r in utr3)
>>> print(f"3' UTR: {utr3_length}bp")
Note:
For positive strand: regions after CDS end.
For negative strand: regions before CDS start.
"""
if self.cds[0] >= self.cds[1]:
return []
regions = []
k = 0
if self.strand == "-":
while self.exons[k].stop < self.cds[0]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start,
stop=self.exons[k].stop,
),
)
k += 1
if self.exons[k].start < self.cds[0]:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.exons[k].start,
stop=self.cds[0] - 1,
),
)
else:
while self.exons[k].stop < self.cds[1]:
k += 1
if self.exons[k].stop == self.cds[1]:
k += 1
else:
regions.append(
BedRegion(
chrom=self.chrom,
start=self.cds[1] + 1,
stop=self.exons[k].stop,
),
)
k += 1
regions.extend([
BedRegion(chrom=self.chrom, start=exon.start, stop=exon.stop)
for exon in self.exons[k:]
])
return regions
[docs]
def all_regions(
self, ss_extend: int = 0, prom: int = 0,
) -> list[BedRegion]:
"""Get all transcript regions with optional extensions.
Returns all exonic regions, optionally extending into splice sites
and promoter regions.
Args:
ss_extend (int): Number of bases to extend into splice sites
at coding exon boundaries. Default is 0.
prom (int): Number of bases to extend into promoter region
upstream of transcription start. Default is 0.
Returns:
list[BedRegion]: List of all transcript regions, potentially
extended based on parameters.
Example:
>>> # Basic exonic regions
>>> regions = transcript.all_regions()
>>> # With splice site extension
>>> regions = transcript.all_regions(ss_extend=3)
>>> # With promoter region
>>> regions = transcript.all_regions(prom=2000)
Note:
Promoter extension is strand-aware: extends upstream of the
transcription start (before first exon for +, after last for -).
"""
# pylint:disable=too-many-branches
regions = []
if ss_extend == 0:
regions.extend([
BedRegion(chrom=self.chrom, start=exon.start, stop=exon.stop)
for exon in self.exons
])
else:
for exon in self.exons:
if exon.stop <= self.cds[0]:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start, stop=exon.stop),
)
elif exon.start <= self.cds[0]:
if exon.stop >= self.cds[1]:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start, stop=exon.stop),
)
else:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start,
stop=exon.stop + ss_extend,
),
)
elif exon.start > self.cds[1]:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start, stop=exon.stop),
)
else:
if exon.stop >= self.cds[1]:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start - ss_extend,
stop=exon.stop,
),
)
else:
regions.append(
BedRegion(
chrom=self.chrom,
start=exon.start - ss_extend,
stop=exon.stop + ss_extend,
),
)
if prom != 0:
if self.strand == "+":
regions[0] = BedRegion(
chrom=regions[0].chrom,
start=regions[0].start - prom,
stop=regions[0].stop,
)
else:
regions[-1] = BedRegion(
chrom=regions[-1].chrom,
start=regions[-1].start,
stop=regions[-1].stop + prom,
)
return regions
[docs]
def total_len(self) -> int:
length = 0
for reg in self.exons:
length += reg.stop - reg.start + 1
return length
[docs]
def cds_len(self) -> int:
regions = self.cds_regions()
length = 0
for reg in regions:
length += reg.stop - reg.start + 1
return length
[docs]
def utr3_len(self) -> int:
utr3 = self.utr3_regions()
length = 0
for reg in utr3:
length += reg.stop - reg.start + 1
return length
[docs]
def utr5_len(self) -> int:
utr5 = self.utr5_regions()
length = 0
for reg in utr5:
length += reg.stop - reg.start + 1
return length
[docs]
def calc_frames(self) -> list[int]:
"""Calculate reading frame for each exon.
Computes the codon reading frame (0, 1, or 2) for each exon based
on the CDS coordinates and strand orientation.
Returns:
list[int]: Reading frame for each exon. Values are:
- 0, 1, or 2 for coding exons (bases into current codon)
- -1 for non-coding exons or non-coding transcripts
Example:
>>> frames = transcript.calc_frames()
>>> for exon, frame in zip(transcript.exons, frames):
... if frame >= 0:
... print(f"Exon {exon.start}-{exon.stop}: frame {frame}")
Note:
Frame calculation is strand-aware and considers exon order.
Use update_frames() to set frame attribute on Exon objects.
"""
length = len(self.exons)
fms = []
if self.cds[0] > self.cds[1]:
fms = [-1] * length
elif self.strand == "+":
k = 0
while self.exons[k].stop < self.cds[0]:
fms.append(-1)
k += 1
fms.append(0)
if self.exons[k].stop < self.cds[1]:
fms.append((self.exons[k].stop - self.cds[0] + 1) % 3)
k += 1
while self.exons[k].stop < self.cds[1] and k < length:
fms.append(
(fms[k] +
self.exons[k].stop - self.exons[k].start + 1) % 3,
)
k += 1
fms += [-1] * (length - len(fms))
else:
k = length - 1
while self.exons[k].start > self.cds[1]:
fms.append(-1)
k -= 1
fms.append(0)
if self.cds[0] < self.exons[k].start:
fms.append((self.cds[1] - self.exons[k].start + 1) % 3)
k -= 1
while self.cds[0] < self.exons[k].start and k > -1:
fms.append(
(fms[-1] + self.exons[k].stop - self.exons[k].start + 1)
% 3,
)
k -= 1
fms += [-1] * (length - len(fms))
fms = fms[::-1]
assert len(self.exons) == len(fms)
return fms
[docs]
def update_frames(self) -> None:
"""Update the frame attribute of all exons.
Calculates reading frames using calc_frames() and updates the
frame attribute of each Exon object.
Example:
>>> transcript.update_frames()
>>> for exon in transcript.exons:
... print(f"Exon frame: {exon.frame}")
Note:
This modifies the Exon objects in place.
"""
frames = self.calc_frames()
for exon, frame in zip(self.exons, frames, strict=True):
exon.frame = frame
[docs]
def test_frames(self) -> bool:
"""Verify that exon frames are correctly set.
Compares the frame attribute of each exon with the calculated
frame to ensure consistency.
Returns:
bool: True if all exon frames match calculated values,
False otherwise.
Example:
>>> transcript.update_frames()
>>> assert transcript.test_frames()
"""
frames = self.calc_frames()
for exon, frame in zip(self.exons, frames, strict=True):
if exon.frame != frame:
return False
return True