Merge pull request #134 from GenomicMedLab/staging

Staging
GenomicMedLab · Mar 22, 2023 · 66ba20a · 66ba20a
2 parents 4456bfd + b375a20
commit 66ba20a
Show file tree

Hide file tree

Showing 16 changed files with 589 additions and 446 deletions.
diff --git a/README.md b/README.md
@@ -71,6 +71,8 @@ If you do not wish to use the default, you must set the environment variable `UT
 #### SeqRepo
 `cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
 
+Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
+
 From the _root_ directory:
 ```
 pip install seqrepo
@@ -96,13 +98,15 @@ exit
 
 ![image](biomart.png)
 
+Use the `TRANSCRIPT_MAPPINGS_PATH` environment variable to set the path of an already existing `transcript_mapping.tsv`. The default is `cool_seq_tool/data/transcript_mapping.tsv`.
+
 #### LRG_RefSeqGene
 
-`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
+`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene` if the environment variable `LRG_REFSEQGENE_PATH` is not set. When `LRG_REFSEQGENE_PATH` is set, `cool-seq-tool` will look at this path and expect the LRG_RefSeqGene file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
 
 #### MANE Summary Data
 
-`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
+`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz` if the environment variable `MANE_SUMMARY_PATH` is not set. When `MANE_SUMMARY_PATH` is set, `cool-seq-tool` will look at this path and expect the MANE Summary Data file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
 
 ## Starting the UTA Tools Service Locally
 

diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py
@@ -14,21 +14,27 @@
 
 LOG_FN = "cool_seq_tool.log"
 
-if "UTA_DB_URL" in environ:
-    UTA_DB_URL = environ["UTA_DB_URL"]
-else:
-    UTA_DB_URL = "postgresql://uta_admin@localhost:5433/uta/uta_20210129"
-
-if "SEQREPO_DATA_PATH" in environ:
-    SEQREPO_DATA_PATH = environ["SEQREPO_DATA_PATH"]
-else:
-    SEQREPO_DATA_PATH = "/usr/local/share/seqrepo/latest"
-
-TRANSCRIPT_MAPPINGS_PATH = f"{APP_ROOT}/data/transcript_mapping.tsv"
-
-from cool_seq_tool.data import DataDownload  # noqa: E402, I202
-d = DataDownload()
-MANE_SUMMARY_PATH = d._mane_summary_path
-LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
+UTA_DB_URL = environ.get("UTA_DB_URL",
+                         "postgresql://uta_admin@localhost:5433/uta/uta_20210129")
+SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
+                                     "/usr/local/share/seqrepo/latest"))
+TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
+                                            f"{APP_ROOT}/data/transcript_mapping.tsv"))
+
+
+MANE_SUMMARY_PATH = environ.get("MANE_SUMMARY_PATH")
+LRG_REFSEQGENE_PATH = environ.get("LRG_REFSEQGENE_PATH")
+if not all((MANE_SUMMARY_PATH, LRG_REFSEQGENE_PATH)):
+    from cool_seq_tool.data import DataDownload  # noqa: E402, I202
+    d = DataDownload()
+
+    if not MANE_SUMMARY_PATH:
+        MANE_SUMMARY_PATH = d._mane_summary_path
+
+    if not LRG_REFSEQGENE_PATH:
+        LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
+MANE_SUMMARY_PATH = Path(MANE_SUMMARY_PATH)
+LRG_REFSEQGENE_PATH = Path(LRG_REFSEQGENE_PATH)
+
 
 from cool_seq_tool.cool_seq_tool import CoolSeqTool  # noqa: E402, F401, I202
diff --git a/cool_seq_tool/cool_seq_tool.py b/cool_seq_tool/cool_seq_tool.py
@@ -3,6 +3,8 @@
 from typing import Optional, Union, List, Tuple, Dict
 from pathlib import Path
 
+from gene.query import QueryHandler as GeneQueryHandler
+
 from cool_seq_tool import logger
 from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
 from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
@@ -18,24 +20,31 @@
 class CoolSeqTool:
     """Class to initialize data sources."""
 
-    def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
-                 transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
-                 lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH,
-                 mane_data_path: str = MANE_SUMMARY_PATH,
-                 db_url: str = UTA_DB_URL, db_pwd: str = "",
-                 gene_db_url: str = "", gene_db_region: str = "us-east-2"
-                 ) -> None:
+    def __init__(
+        self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
+        transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
+        lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
+        mane_data_path: Path = MANE_SUMMARY_PATH,
+        db_url: str = UTA_DB_URL, db_pwd: str = "",
+        gene_query_handler: GeneQueryHandler = None,
+        gene_db_url: str = "", gene_db_region: str = "us-east-2"
+    ) -> None:
         """Initialize CoolSeqTool class
 
-        :param str seqrepo_data_path: The path to the seqrepo directory.
-        :param str transcript_file_path: The path to transcript_mappings.tsv
-        :param str lrg_refseqgene_path: The path to LRG_RefSeqGene
-        :param str mane_data_path: Path to RefSeq MANE summary data
+        :param Path seqrepo_data_path: The path to the seqrepo directory.
+        :param Path transcript_file_path: The path to transcript_mappings.tsv
+        :param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
+        :param Path mane_data_path: Path to RefSeq MANE summary data
         :param str db_url: PostgreSQL connection URL
             Format: `driver://user:pass@host/database/schema`
         :param str db_pwd: User's password for uta database
-        :param str gene_db_url: URL to gene normalizer dynamodb
-        :param str gene_db_region: AWS region for gene normalizer db
+        :param GeneQueryHandler gene_query_handler: Gene normalizer query handler
+            instance. If provided, this existing instance is used. If not provided,
+            a new instance is created.
+        :param str gene_db_url: URL to gene normalizer dynamodb. Only used when
+            `gene_query_handler` is `None`.
+        :param str gene_db_region: AWS region for gene normalizer db. Only used when
+            `gene_query_handler` is `None`.
         """
         self.seqrepo_access = SeqRepoAccess(
             seqrepo_data_path=seqrepo_data_path)
@@ -45,7 +54,8 @@ def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
         self.mane_transcript_mappings = MANETranscriptMappings(
             mane_data_path=mane_data_path)
         self.uta_db = UTADatabase(db_url=db_url, db_pwd=db_pwd)
-        gene_normalizer = GeneNormalizer(gene_db_url, gene_db_region)
+        gene_normalizer = GeneNormalizer(gene_query_handler, gene_db_url,
+                                         gene_db_region)
         self.alignment_mapper = AlignmentMapper(
             self.seqrepo_access, self.transcript_mappings, self.uta_db)
         self.mane_transcript = MANETranscript(
@@ -471,7 +481,7 @@ async def _set_genomic_data(self, params: Dict, strand: int,
         if not grch38_ac:
             return f"Invalid genomic accession: {params['chr']}"
 
-        grch38_ac = grch38_ac[0][0]
+        grch38_ac = grch38_ac[0]
         if grch38_ac != params["chr"]:  # params["chr"] is genomic accession
             # Liftover to 38
             descr = await self.uta_db.get_chr_assembly(params["chr"])

diff --git a/cool_seq_tool/data_sources/gene_normalizer.py b/cool_seq_tool/data_sources/gene_normalizer.py
@@ -10,13 +10,24 @@
 class GeneNormalizer:
     """Gene Normalizer class for getting gene data"""
 
-    def __init__(self, db_url: str = "", db_region: str = "us-east-2") -> None:
+    def __init__(
+        self, query_handler: QueryHandler = None, db_url: str = "",
+        db_region: str = "us-east-2"
+    ) -> None:
         """Initialize gene normalizer class
 
-        :param str db_url: URL to gene normalizer dynamodb
-        :param str db_region: AWS region for gene normalizer db
+        :param QueryHandler query_handler: Gene normalizer query handler instance.
+            If provided, this existing instance is used. If not provided, a new
+            instance is created.
+        :param str db_url: URL to gene normalizer dynamodb. Only used when
+            `query_handler` is `None`.
+        :param str db_region: AWS region for gene normalizer db. Only used when
+            `query_handler` is `None`.
         """
-        self.query_handler = QueryHandler(db_url, db_region)
+        if query_handler:
+            self.query_handler = query_handler
+        else:
+            self.query_handler = QueryHandler(db_url, db_region)
 
     def get_hgnc_data(self, gene: str) -> Dict:
         """Return HGNC data for a given gene

diff --git a/cool_seq_tool/data_sources/mane_transcript.py b/cool_seq_tool/data_sources/mane_transcript.py
@@ -736,7 +736,7 @@ async def g_to_grch38(self, ac: str, start_pos: int,
 
         newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
         if newest_ac:
-            ac = newest_ac[0][0]
+            ac = newest_ac[0]
             if self._validate_index(ac, (start_pos, end_pos), 0):
                 return dict(
                     ac=ac,

diff --git a/cool_seq_tool/data_sources/mane_transcript_mappings.py b/cool_seq_tool/data_sources/mane_transcript_mappings.py
@@ -1,4 +1,5 @@
 """The module for loading MANE Transcript mappings to genes."""
+from pathlib import Path
 from typing import Dict, Optional, List
 
 import pandas as pd
@@ -9,9 +10,9 @@
 class MANETranscriptMappings:
     """The MANE Transcript mappings class."""
 
-    def __init__(self, mane_data_path: str = MANE_SUMMARY_PATH) -> None:
+    def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
         """Initialize the MANE Transcript mappings class.
-        :param str mane_data_path: Path to RefSeq MANE summary data
+        :param Path mane_data_path: Path to RefSeq MANE summary data
         """
         self.mane_data_path = mane_data_path
         self.df = self._load_mane_transcript_data()

diff --git a/cool_seq_tool/data_sources/seqrepo_access.py b/cool_seq_tool/data_sources/seqrepo_access.py
@@ -1,6 +1,7 @@
 """A module for accessing SeqRepo."""
 from typing import Optional, List, Tuple, Union
 from os import environ
+from pathlib import Path
 
 from biocommons.seqrepo import SeqRepo
 
@@ -12,9 +13,9 @@
 class SeqRepoAccess:
     """The SeqRepoAccess class."""
 
-    def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH) -> None:
+    def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
         """Initialize the SeqRepoAccess class.
-        :param str seqrepo_data_path: The path to the seqrepo directory.
+        :param Path seqrepo_data_path: The path to the seqrepo directory.
         """
         environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
         self.seqrepo_client = SeqRepo(seqrepo_data_path)

diff --git a/cool_seq_tool/data_sources/transcript_mappings.py b/cool_seq_tool/data_sources/transcript_mappings.py
@@ -1,5 +1,6 @@
 """The module for Transcript Mappings."""
 import csv
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH
@@ -8,12 +9,12 @@
 class TranscriptMappings:
     """The transcript mappings class."""
 
-    def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
-                 lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH) -> None:
+    def __init__(self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
+                 lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH) -> None:
         """Initialize the transcript mappings class.
 
-        :param str transcript_file_path: Path to transcript mappings file
-        :param str lrg_refseqgene_path: Path to LRG RefSeqGene file
+        :param Path transcript_file_path: Path to transcript mappings file
+        :param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
         """
         # ENSP <-> Gene Symbol
         self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {}
@@ -51,11 +52,10 @@ def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
         self._load_transcript_mappings_data(transcript_file_path)
         self._load_refseq_gene_symbol_data(lrg_refseqgene_path)
 
-    def _load_transcript_mappings_data(self,
-                                       transcript_file_path: str) -> None:
+    def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
         """Load transcript mappings file to dictionaries.
 
-        :param str transcript_file_path: Path to transcript mappings file
+        :param Path transcript_file_path: Path to transcript mappings file
         """
         with open(transcript_file_path) as file:
             reader = csv.DictReader(file, delimiter="\t")
@@ -96,10 +96,10 @@ def _load_transcript_mappings_data(self,
                         self.ensp_to_enst[versioned_protein_transcript] = \
                             versioned_transcript
 
-    def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: str) -> None:
+    def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None:
         """Load data from RefSeq Gene Symbol file to dictionaries.
 
-        :param str lrg_refseqgene_path: Path to LRG RefSeqGene file
+        :param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
         """
         with open(lrg_refseqgene_path) as file:
             reader = csv.DictReader(file, delimiter="\t")