Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Check if MANE transcript exists in UTA, if not select longest compatible #378

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/cool_seq_tool/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def __init__(
self.ex_g_coords_mapper = ExonGenomicCoordsMapper(
self.seqrepo_access,
self.uta_db,
self.mane_transcript,
self.mane_transcript_mappings,
self.liftover,
)
110 changes: 71 additions & 39 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
from cool_seq_tool.mappers.liftover import LiftOver
from cool_seq_tool.mappers.mane_transcript import ManeTranscript
from cool_seq_tool.schemas import (
AnnotationLayer,
Assembly,
BaseModelForbidExtra,
ServiceMeta,
Expand Down Expand Up @@ -232,6 +234,7 @@ def __init__(
self,
seqrepo_access: SeqRepoAccess,
uta_db: UtaDatabase,
mane_transcript: ManeTranscript,
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
mane_transcript_mappings: ManeTranscriptMappings,
liftover: LiftOver,
) -> None:
Expand All @@ -257,10 +260,12 @@ def __init__(
:param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
:param uta_db: UtaDatabase instance to give access to query UTA database
:param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
:param mane_transcript: Instance to provide access to the ManeTranscript class
:param liftover: Instance to provide mapping between human genome assemblies
"""
self.seqrepo_access = seqrepo_access
self.uta_db = uta_db
self.mane_transcript = mane_transcript
self.mane_transcript_mappings = mane_transcript_mappings
self.liftover = liftover

Expand Down Expand Up @@ -796,37 +801,14 @@ async def _genomic_to_tx_segment(
mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(
gene
)

if mane_transcripts:
if mane_transcripts and await self.uta_db.validate_mane_transcript_ac(
mane_transcripts
):
transcript = mane_transcripts[0]["RefSeq_nuc"]
else:
# Attempt to find a coding transcript if a MANE transcript
# cannot be found
results = await self.uta_db.get_transcripts(
gene=gene, alt_ac=genomic_ac
transcript = await self._select_optimal_transcript(
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
genomic_pos, genomic_ac, gene
)

if not results.is_empty():
transcript = results[0]["tx_ac"][0]
else:
# Run if gene is for a noncoding transcript
query = f"""
SELECT DISTINCT tx_ac
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE hgnc = '{gene}'
AND alt_ac = '{genomic_ac}'
""" # noqa: S608
result = await self.uta_db.execute_query(query)

if result:
transcript = result[0]["tx_ac"]
else:
return GenomicTxSeg(
errors=[
f"Could not find a transcript for {gene} on {genomic_ac}"
]
)

tx_exons = await self._get_all_exon_coords(
tx_ac=transcript, genomic_ac=genomic_ac
)
Expand Down Expand Up @@ -934,6 +916,48 @@ async def _genomic_to_tx_segment(
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
)

async def _select_optimal_transcript(
self, genomic_pos: int, genomic_ac: str, gene: str
) -> str:
"""Select the optimal transcript given a genomic position, accession, and gene.
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
In this case, optimal refers to the transcript that is the best represenative
transcript for this data, given that MANE data does not exist for the
gene or the MANE transcript for the gene does not exist in UTA

:param genomic_pos: Genomic position where the transcript segment starts or ends
(inter-residue based)
:param genomic_ac: Genomic accession
:param gene: Valid, case-sensitive HGNC gene symbol
:return A string representing the optimal transcript given the above data
"""
# Attempt to find a coding transcript if a MANE transcript cannot be found
transcript = None
results = await self.mane_transcript.get_longest_compatible_transcript(
start_pos=genomic_pos,
end_pos=genomic_pos,
start_annotation_layer=AnnotationLayer.GENOMIC,
gene=gene,
)
if results:
transcript = results.refseq
else:
# Run if gene is for a noncoding transcript
query = f"""
SELECT DISTINCT tx_ac
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE hgnc = '{gene}'
AND alt_ac = '{genomic_ac}'
""" # noqa: S608
result = await self.uta_db.execute_query(query)

if result:
transcript = result[0]["tx_ac"]
jarbesfeld marked this conversation as resolved.
Show resolved Hide resolved
else:
return GenomicTxSeg(
errors=[f"Could not find a transcript for {gene} on {genomic_ac}"]
)
return transcript

async def _get_grch38_ac_pos(
self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
) -> tuple[str | None, int | None, str | None]:
Expand Down Expand Up @@ -1065,24 +1089,32 @@ async def _get_tx_seg_genomic_metadata(
transcript
:return: Transcript segment data and associated genomic metadata
"""
if tx_ac:
# We should always try to liftover
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
grch38_ac = grch38_ac[0]
else:
# We should always try to liftover
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
if not grch38_ac:
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
grch38_ac = grch38_ac[0]

non_mane_transcript = None

if not tx_ac:
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
if not mane_data:
err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
if gene:
err_msg += f" on gene {gene}"
_logger.warning(err_msg)
return GenomicTxSeg(errors=[err_msg])
if not await self.uta_db.validate_mane_transcript_ac(mane_data):
non_mane_transcript = await self._select_optimal_transcript(
genomic_pos, genomic_ac, gene
)

mane_data = mane_data[0]
tx_ac = mane_data["RefSeq_nuc"]
grch38_ac = mane_data["GRCh38_chr"]
if non_mane_transcript:
tx_ac = non_mane_transcript
else:
mane_data = mane_data[0]
tx_ac = mane_data["RefSeq_nuc"]
grch38_ac = mane_data["GRCh38_chr"]

# Always liftover to GRCh38
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
Expand Down
17 changes: 17 additions & 0 deletions src/cool_seq_tool/sources/uta_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,23 @@ async def validate_genomic_ac(self, ac: str) -> bool:
result = await self.execute_query(query)
return result[0][0]

async def validate_mane_transcript_ac(self, mane_transcripts: list[dict]) -> bool:
"""Return whether or not a MANE transcript exists in UTA. This looks at the
first entry in the MANE transcripts list as this is the higher priority
transcript.

:param transcript_list: A list of MANE transcripts
:return ``True`` if transcript accession exists in UTA. ``False`` otherwise
"""
transcript = mane_transcripts[0]["RefSeq_nuc"]
query = f"""
SELECT *
FROM {self.schema}.tx_exon_aln_v
WHERE tx_ac = '{transcript}'
""" # noqa: S608
results = await self.execute_query(query)
return bool(results)

async def get_ac_descr(self, ac: str) -> str | None:
"""Return accession description. This is typically available only for accessions
from older (pre-GRCh38) builds.
Expand Down
9 changes: 9 additions & 0 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,15 @@ async def test_genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8):
)
genomic_tx_seg_checks(resp, tpm3_exon8)

# Check if transcript exists in UTA. If not, return longest compatible transcript
resp = await test_egc_mapper._genomic_to_tx_segment(
22513522,
genomic_ac="NC_000016.10",
is_seg_start=False,
gene="NPIPB5",
)
assert resp.tx_ac == "NM_001135865.2"


@pytest.mark.asyncio()
async def test_tpm3(
Expand Down