Skip to content

Commit

Permalink
feat!: remove UtaDatabase.get_tx_exons + cleanup exon work (#347)
Browse files Browse the repository at this point in the history
addresses #224

* initial work for cleaning up exon coord data retrieval
  • Loading branch information
korikuzma committed Aug 21, 2024
1 parent cc6cf81 commit 9b8228f
Show file tree
Hide file tree
Showing 5 changed files with 266 additions and 162 deletions.
208 changes: 120 additions & 88 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import logging
from typing import Literal, TypeVar

from pydantic import Field, StrictInt

from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
from cool_seq_tool.mappers.liftover import LiftOver
from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
from cool_seq_tool.schemas import (
AnnotationLayer,
Assembly,
BaseModelForbidExtra,
CoordinateType,
GenomicData,
GenomicDataResponse,
Expand All @@ -27,6 +30,26 @@
_logger = logging.getLogger(__name__)


class ExonCoord(BaseModelForbidExtra):
"""Model for representing exon coordinate data"""

ord: StrictInt = Field(..., description="Exon number. 0-based.")
tx_start_i: StrictInt = Field(
...,
description="Transcript start index of the exon. Inter-residue coordinates.",
)
tx_end_i: StrictInt = Field(
..., description="Transcript end index of the exon. Inter-residue coordinates."
)
alt_start_i: StrictInt = Field(
..., description="Genomic start index of the exon. Inter-residue coordinates."
)
alt_end_i: StrictInt = Field(
..., description="Genomic end index of the exon. Inter-residue coordinates."
)
alt_strand: Strand = Field(..., description="Strand.")


class ExonGenomicCoordsMapper:
"""Provide capabilities for mapping transcript exon representation to/from genomic
coordinate representation.
Expand Down Expand Up @@ -156,18 +179,16 @@ async def tx_segment_to_genomic(
if warnings:
return self._return_warnings(resp, warnings)

# Get all exons and associated start/end coordinates for transcript
tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
if not tx_exons:
return self._return_warnings(resp, [warning] if warning else [])

# Get exon start and exon end coordinates
tx_exon_coords, warning = self.get_tx_exon_coords(
transcript, tx_exons, exon_start, exon_end
(
tx_exon_start_coords,
tx_exon_end_coords,
errors,
) = await self._get_start_end_exon_coords(
transcript, exon_start=exon_start, exon_end=exon_end
)
if not tx_exon_coords:
return self._return_warnings(resp, [warning] if warning else [])
tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
if errors:
return self._return_warnings(resp, errors)

if gene:
gene = gene.upper().strip()
Expand Down Expand Up @@ -373,67 +394,89 @@ async def genomic_to_tx_segment(
resp.genomic_data = GenomicData(**params)
return resp

@staticmethod
def _validate_exon(
transcript: str, tx_exons: list[tuple[int, int]], exon_number: int
) -> tuple[tuple[int, int] | None, str | None]:
"""Validate that exon number exists on a given transcript
async def _get_all_exon_coords(
self, tx_ac: str, genomic_ac: str | None = None
) -> list[ExonCoord]:
"""Get all exon coordinate data for a transcript.
:param transcript: Transcript accession
:param tx_exons: List of transcript's exons and associated coordinates
:param exon_number: Exon number to validate
:return: Exon coordinates for a given exon number and warnings if found
"""
msg = f"Exon {exon_number} does not exist on {transcript}"
try:
if exon_number < 1:
return None, msg
exon = tx_exons[exon_number - 1]
except IndexError:
return None, msg
return exon, None
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
associated to ``tx_ac``.
def get_tx_exon_coords(
:param tx_ac: The RefSeq transcript accession to get exon data for.
:param genomic_ac: The RefSeq genomic accession to get exon data for.
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
The exon coordinate data will include the exon number, transcript and
genomic positions for the start and end of the exon, and strand.
"""
if genomic_ac:
query = f"""
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE tx_ac = '{tx_ac}'
AND alt_aln_method = 'splign'
AND alt_ac = '{genomic_ac}'
""" # noqa: S608
else:
query = f"""
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
FROM {self.uta_db.schema}.tx_exon_aln_v as t
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
ON t.alt_ac = s.ac
WHERE s.descr = ''
AND t.tx_ac = '{tx_ac}'
AND t.alt_aln_method = 'splign'
AND t.alt_ac like 'NC_000%'
""" # noqa: S608

results = await self.uta_db.execute_query(query)
return [ExonCoord(**r) for r in results]

async def _get_start_end_exon_coords(
self,
transcript: str,
tx_exons: list[tuple[int, int]],
tx_ac: str,
exon_start: int | None = None,
exon_end: int | None = None,
) -> tuple[
tuple[tuple[int, int] | None, tuple[int, int] | None] | None,
str | None,
]:
"""Get exon coordinates for ``exon_start`` and ``exon_end``
:param transcript: Transcript accession
:param tx_exons: List of all transcript exons and coordinates
:param exon_start: Start exon number
:param exon_end: End exon number
:return: [Transcript start exon coords, Transcript end exon coords],
and warnings if found
genomic_ac: str | None = None,
) -> tuple[ExonCoord | None, ExonCoord | None, list[str]]:
"""Get exon coordinates for a transcript given exon start and exon end.
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
associated to ``tx_ac``.
:param tx_ac: The RefSeq transcript accession to get exon data for.
:param exon_start: Start exon number to get coordinate data for. 1-based.
:param exon_end: End exon number to get coordinate data for. 1-based.
:param genomic_ac: The RefSeq genomic accession to get exon data for.
:return: Tuple containing start exon coordinate data, end exon coordinate data,
and list of errors. The exon coordinate data will include the exon number,
transcript and genomic positions for the start and end of the exon, and
strand.
"""
if exon_start is not None:
tx_exon_start, warning = self._validate_exon(
transcript, tx_exons, exon_start
)
if not tx_exon_start:
return None, warning
else:
tx_exon_start = None
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
if not tx_exons:
return None, None, [f"No exons found given {tx_ac}"]

if exon_end is not None:
tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end)
if not tx_exon_end:
return None, warning
else:
tx_exon_end = None
return (tx_exon_start, tx_exon_end), None
errors = []
start_end_exons = []
for exon_num in [exon_start, exon_end]:
if exon_num is not None:
try:
start_end_exons.append(tx_exons[exon_num - 1])
continue
except IndexError:
errors.append(f"Exon {exon_num} does not exist on {tx_ac}")
start_end_exons.append(None)

if errors:
start_end_exons = [None, None]

return *start_end_exons, errors

async def _get_alt_ac_start_and_end(
self,
tx_ac: str,
tx_exon_start: tuple[int, int] | None = None,
tx_exon_end: tuple[int, int] | None = None,
tx_exon_start: ExonCoord | None = None,
tx_exon_end: ExonCoord | None = None,
gene: str | None = None,
) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
"""Get aligned genomic coordinates for transcript exon start and end.
Expand All @@ -455,7 +498,7 @@ async def _get_alt_ac_start_and_end(
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
if exon:
alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
tx_ac, exon[0], exon[1], gene=gene
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
)
if alt_ac_val:
alt_ac_data[key] = alt_ac_val
Expand Down Expand Up @@ -745,14 +788,17 @@ async def _set_mane_genomic_data(
if mane_data.ensembl
else None
)
tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)

tx_exons = await self._get_all_exon_coords(
params["transcript"], genomic_ac=alt_ac
)
if not tx_exons:
return f"Unable to get exons for {params['transcript']}"
return f"No exons found given {params['transcript']}"
tx_pos = mane_data.pos[0] + mane_data.coding_start_site
params["exon"] = self._get_exon_number(tx_exons, tx_pos)

try:
tx_exon = tx_exons[params["exon"] - 1]
tx_exon: ExonCoord = tx_exons[params["exon"] - 1]
except IndexError:
msg = (
f"{params['transcript']} with position {tx_pos} "
Expand All @@ -765,8 +811,8 @@ async def _set_mane_genomic_data(
params["strand"] = strand_to_use
self._set_exon_offset(
params,
tx_exon[0],
tx_exon[1],
tx_exon.tx_start_i,
tx_exon.tx_end_i,
tx_pos,
is_start=is_start,
strand=strand_to_use,
Expand Down Expand Up @@ -825,9 +871,11 @@ async def _set_genomic_data(
params["pos"] = liftover_data[1]
params["chr"] = grch38_ac

tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac)
tx_exons = await self._get_all_exon_coords(
params["transcript"], genomic_ac=grch38_ac
)
if not tx_exons:
return f"Unable to get exons for {params['transcript']}"
return f"No exons found given {params['transcript']}"

data = await self.uta_db.get_tx_exon_aln_v_data(
params["transcript"],
Expand All @@ -848,13 +896,13 @@ async def _set_genomic_data(
i = 1
found_tx_exon = False
for exon in tx_exons:
if data_exons == exon:
if data_exons == (exon.tx_start_i, exon.tx_end_i):
found_tx_exon = True
break
i += 1
if not found_tx_exon:
# Either first or last
i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1
i = 1 if data_exons == (0, tx_exons[0].tx_end_i) else i - 1
params["exon"] = i

strand_to_use = strand if strand is not None else Strand(data[7])
Expand Down Expand Up @@ -897,24 +945,8 @@ def _set_exon_offset(
else:
params["exon_offset"] = pos - start

async def _structure_exons(
self, transcript: str, alt_ac: str | None = None
) -> list[tuple[int, int]]:
"""Structure exons as list of tuples.
:param transcript: Transcript accession
:param alt_ac: Genomic accession
:return: List of tuples containing transcript exon coordinates
"""
tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac)

if not tx_exons:
return []

return [(coords[0], coords[1]) for coords in tx_exons]

@staticmethod
def _get_exon_number(tx_exons: list, tx_pos: int) -> int:
def _get_exon_number(tx_exons: list[ExonCoord], tx_pos: int) -> int:
"""Find related exon number for a position
:param tx_exons: List of exon coordinates for a transcript
Expand All @@ -923,7 +955,7 @@ def _get_exon_number(tx_exons: list, tx_pos: int) -> int:
"""
i = 1
for coords in tx_exons:
if coords[0] <= tx_pos <= coords[1]:
if coords.tx_start_i <= tx_pos <= coords.tx_end_i:
break
i += 1
return i
Expand Down
40 changes: 0 additions & 40 deletions src/cool_seq_tool/sources/uta_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,46 +297,6 @@ async def get_genes_and_alt_acs(
alt_acs.add(r[1])
return GenesGenomicAcs(genes=genes, alt_acs=alt_acs), None

async def get_tx_exons(
self, tx_ac: str, alt_ac: str | None = None
) -> tuple[list[tuple[int, int]] | None, str | None]:
"""Get list of transcript exons start/end coordinates.
:param tx_ac: Transcript accession
:param alt_ac: Genomic accession
:return: List of a transcript's accessions and warnings if found
"""
if alt_ac:
# We know what assembly we're looking for since we have the
# genomic accession
query = f"""
SELECT DISTINCT tx_start_i, tx_end_i
FROM {self.schema}.tx_exon_aln_v
WHERE tx_ac = '{tx_ac}'
AND alt_aln_method = 'splign'
AND alt_ac = '{alt_ac}'
""" # noqa: S608
else:
# Use GRCh38 by default if no genomic accession is provided
query = f"""
SELECT DISTINCT tx_start_i, tx_end_i
FROM {self.schema}.tx_exon_aln_v as t
INNER JOIN {self.schema}._seq_anno_most_recent as s
ON t.alt_ac = s.ac
WHERE s.descr = ''
AND t.tx_ac = '{tx_ac}'
AND t.alt_aln_method = 'splign'
AND t.alt_ac like 'NC_000%'
""" # noqa: S608
result = await self.execute_query(query)

if not result:
msg = f"Unable to get exons for {tx_ac}"
_logger.warning(msg)
return None, msg
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
return tx_exons, None

async def get_tx_exons_genomic_coords(
self,
tx_ac: str,
Expand Down
Loading

0 comments on commit 9b8228f

Please sign in to comment.