From ef342d4fc357fda7623dcae16ba4d539c7460696 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <50678786+jarbesfeld@users.noreply.github.com> Date: Mon, 27 Jan 2025 09:33:21 -0500 Subject: [PATCH] feat!: Change breakpoint validation function (#397) --- .../mappers/exon_genomic_coords.py | 34 ++++++++----------- tests/mappers/test_exon_genomic_coords.py | 2 +- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 40756fb..723037f 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -865,14 +865,14 @@ async def _genomic_to_tx_segment( if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE: genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue - # Validate that the breakpoint occurs on a transcript given a gene - coordinate_check = await self._validate_gene_coordinates( - pos=genomic_pos, genomic_ac=genomic_ac, gene=gene + # Validate that the breakpoint falls between the first and last exon for the selected transcript + coordinate_check = await self._validate_genomic_breakpoint( + pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript ) if not coordinate_check: return GenomicTxSeg( errors=[ - f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}" + f"{genomic_pos} on {genomic_ac} does not occur within the exons for {transcript}" ] ) @@ -943,38 +943,32 @@ async def _get_grch38_pos( ) return liftover_data[1] if liftover_data else None - async def _validate_gene_coordinates( + async def _validate_genomic_breakpoint( self, pos: int, genomic_ac: str, - gene: str, + tx_ac: str, ) -> bool: """Validate that a genomic coordinate falls within the first and last exon - given a gene and accession + for a transcript on a given accession :param pos: Genomic position on ``genomic_ac`` :param genomic_ac: RefSeq genomic accession, e.g. 
``"NC_000007.14"`` - :param gene: A valid, case-sensitive HGNC gene symbol + :param tx_ac: A transcript accession :return: ``True`` if the coordinate falls within the first and last exon - for the gene, ``False`` if not + for the transcript, ``False`` if not """ query = f""" WITH tx_boundaries AS ( - SELECT - tx_ac, - hgnc, - MIN(alt_start_i) as min_start, - MAX(alt_end_i) as max_end + SELECT + MIN(alt_start_i) AS min_start, + MAX(alt_end_i) AS max_end FROM {self.uta_db.schema}.tx_exon_aln_v - WHERE hgnc = '{gene}' + WHERE tx_ac = '{tx_ac}' AND alt_ac = '{genomic_ac}' - GROUP BY tx_ac, hgnc ) - SELECT DISTINCT hgnc - FROM tx_boundaries + SELECT * FROM tx_boundaries WHERE {pos} between tx_boundaries.min_start and tx_boundaries.max_end - ORDER BY hgnc - LIMIT 1; """ # noqa: S608 results = await self.uta_db.execute_query(query) return bool(results) diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index 57f277c..8225e23 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -1516,7 +1516,7 @@ async def test_invalid(test_egc_mapper): ) genomic_tx_seg_service_checks(resp, is_valid=False) assert resp.errors == [ - "9999999999998 on NC_000001.11 does not occur within the exons for TPM3" + "9999999999998 on NC_000001.11 does not occur within the exons for NM_152263.3" ] # Must supply either gene or transcript