[docs]classReferenceGenome(ResourceConfigValidationMixin,):"""Provides an interface for quering a reference genome."""def__init__(self,resource:GenomicResource):self.resource=resourceifresource.get_type()!="genome":raiseValueError(f"wrong type of resource passed: {resource.get_type()}")self._index:dict[str,Any]={}self._chromosomes:list[str]=[]self._chromosome_lengths:dict[str,int]={}self._sequence:IO|None=Noneself.pars:dict=self._parse_pars(resource.get_config())@propertydefresource_id(self)->str:returnself.resource.resource_id@staticmethoddef_parse_pars(config:dict[str,Any])->dict:if"PARS"notinconfig:return{}assertconfig["PARS"]["X"]isnotNoneregions_x=[Region.from_str(region)forregioninconfig["PARS"]["X"]]chrom_x=regions_x[0].chromresult={chrom_x:regions_x,}ifconfig["PARS"]["Y"]isnotNone:regions_y=[Region.from_str(region)forregioninconfig["PARS"]["Y"]]chrom_y=regions_y[0].chromresult[chrom_y]=regions_yreturnresult@propertydefchromosomes(self)->list[str]:"""Return a list of all chromosomes of the reference genome."""self._load_genome_index()returnself._chromosomes@propertydefchrom_prefix(self)->str:"""Return a prefix of all chromosomes of the reference genome."""self._load_genome_index()chrom=self._chromosomes[0]ifchrom.startswith("chr"):return"chr"return""def_load_genome_index(self)->None:ifself._index:returnconfig=self.resource.get_config()file_name=config["filename"]index_file_name=config.get("index_file",f"{file_name}.fai")index_content=self.resource.get_file_content(index_file_name)self._parse_genome_index(index_content)def_parse_genome_index(self,index_content:str)->None:forlineinindex_content.split("\n"):line=line.strip()ifnotline:breakrec=line.split()self._index[rec[0]]={"length":int(rec[1]),"startBit":int(rec[2]),"seqLineLength":int(rec[3]),"lineLength":int(rec[4]),}self._chromosome_lengths={chrom:data["length"]forchrom,datainself._index.items()}self._chromosomes=list(self._chromosome_lengths.keys())
def__enter__(self)->ReferenceGenome:returnselfdef__exit__(self,exc_type:type[BaseException]|None,exc_value:BaseException|None,exc_tb:TracebackType|None,)->None:ifexc_typeisnotNone:logger.error("exception while using reference genome: %s, %s, %s",exc_type,exc_value,exc_tb)try:self.close()exceptException:# pylint: disable=broad-exceptlogger.exception("exception during closing reference genome")
[docs]defget_chrom_length(self,chrom:str)->int:"""Return the length of a specified chromosome."""self._load_genome_index()ifchromnotinself._chromosome_lengths:raiseValueError(f"can't find chromosome {chrom}")returnself._chromosome_lengths[chrom]
[docs]defget_all_chrom_lengths(self)->dict[str,int]:"""Return list of all chromosomes lengths."""self._load_genome_index()returnself._chromosome_lengths
[docs]defsplit_into_regions(self,region_size:int,chromosome:str|None=None,)->Generator[Region,None,None]:""" Split the reference genome into regions and yield them. Can specify a specific chromosome to limit the regions to be in that chromosome only. """ifchromosomeisNone:chromosome_lengths=list(self.get_all_chrom_lengths().items())else:chromosome_lengths=[(chromosome,self.get_chrom_length(chromosome)),]ifregion_size==0:yield from[Region(chrom,1)forchrom,_inchromosome_lengths]returnforchrom,chrom_leninchromosome_lengths:logger.debug("Chromosome '%s' has length %s",chrom,chrom_len)i=1whilei<chrom_len-region_size:yieldRegion(chrom,i,i+region_size-1)i+=region_sizeyieldRegion(chrom,i,None)
[docs]deffetch(self,chrom:str,start:int,stop:int|None,buffer_size:int=512,)->Generator[str,None,None]:""" Yield the nucleotides in a specific region. While line feed calculation can be inaccurate because not every fetch will start at the start of a line, line feeds add extra characters to read and the output is limited by the amount of nucleotides expected to be read. """ifchromnotinself.chromosomes:logger.warning("chromosome %s not found in %s",chrom,self.resource.resource_id)returnassertself._sequenceisnotNoneself._sequence.seek(self._index[chrom]["startBit"]+start-1+(start-1)//self._index[chrom]["seqLineLength"],)chrom_length=self.get_chrom_length(chrom)ifstopisNone:length=chrom_length-start+1else:length=min(stop,chrom_length)-start+1line_feeds=1+length//self._index[chrom]["seqLineLength"]total_length=length+line_feedsread_progress=0whileread_progress<length:read_length=min(buffer_size,total_length-read_progress)sequence=self._sequence.read(read_length).decode("ascii")sequence=sequence.replace("\n","").upper()end=min(read_progress+read_length,length-read_progress)sequence=sequence[:end]yield fromsequenceread_progress+=len(sequence)return
[docs]defget_sequence(self,chrom:str,start:int,stop:int)->str:"""Return sequence of nucleotides from specified chromosome region."""return"".join(self.fetch(chrom,start,stop))
[docs]defis_pseudoautosomal(self,chrom:str,pos:int)->bool:"""Return true if specified position is pseudoautosomal."""defin_any_region(chrom:str,pos:int,regions:list[Region])->bool:returnany(regforreginregionsifreg.isin(chrom,pos))pars_regions=self.pars.get(chrom,None)ifpars_regions:returnin_any_region(chrom,pos,pars_regions)returnFalse
[docs]defbuild_reference_genome_from_file(filename:str)->ReferenceGenome:"""Open a reference genome from a file."""dirname=os.path.dirname(filename)basename=os.path.basename(filename)res=build_local_resource(dirname,{"type":"genome","filename":basename,})returnbuild_reference_genome_from_resource(res)
[docs]defbuild_reference_genome_from_resource(resource:GenomicResource)->ReferenceGenome:"""Open a reference genome from resource."""ifresource.get_type()!="genome":logger.error("trying to open a resource %s of type ""%s as reference genome",resource.resource_id,resource.get_type())raiseValueError(f"wrong resource type: {resource.resource_id}")returnReferenceGenome(resource)