Merge pull request #142 from GenomicMedLab/staging

Staging
GenomicMedLab · Apr 4, 2023 · c8126d9 · c8126d9
2 parents 294a2e6 + 437a869
commit c8126d9
Show file tree

Hide file tree

Showing 15 changed files with 208 additions and 253 deletions.
diff --git a/Pipfile b/Pipfile
@@ -15,6 +15,7 @@ pydantic = "*"
 fastapi = "*"
 uvicorn = "*"
 gene-normalizer = "*"
+"ga4gh.vrs" = "*"
 
 [dev-packages]
 cool_seq_tool = {editable = true, path = "."}
@@ -25,11 +26,9 @@ flake8-docstrings = "*"
 flake8-annotations = "*"
 flake8-quotes = "*"
 flake8-import-order = "*"
-coverage = "*"
 pytest-cov = "*"
-coveralls = "*"
-jupyterlab = "*"
 pytest-asyncio = "==0.18.3"
 ipython = "*"
+ipykernel = "*"
 psycopg2-binary = "*"
 mock = "*"
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ The **cool-seq-tool** provides:
 ### pip
 
 ```commandline
-pip install cool-seq-tool
+pip install "cool-seq-tool[dev,tests]"
 ```
 
 ### Development
@@ -30,7 +30,7 @@ Install backend dependencies and enter Pipenv environment:
 
 ```commandline
 pipenv shell
-pipenv lock && pipenv sync
+pipenv update
 pipenv install --dev
 ```
 
@@ -71,7 +71,7 @@ If you do not wish to use the default, you must set the environment variable `UT
 #### SeqRepo
 `cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
 
-Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
+Use the `SEQREPO_ROOT_DIR` environment variable to set the path to an existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
 
 From the _root_ directory:
 ```

diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py
@@ -16,8 +16,7 @@
 
 UTA_DB_URL = environ.get("UTA_DB_URL",
                          "postgresql://uta_admin@localhost:5433/uta/uta_20210129")
-SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
-                                     "/usr/local/share/seqrepo/latest"))
+SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
 TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
                                             f"{APP_ROOT}/data/transcript_mapping.tsv"))
 

diff --git a/cool_seq_tool/cool_seq_tool.py b/cool_seq_tool/cool_seq_tool.py
@@ -3,51 +3,54 @@
 from typing import Optional, Union, List, Tuple, Dict
 from pathlib import Path
 
+from biocommons.seqrepo import SeqRepo
 from gene.query import QueryHandler as GeneQueryHandler
 
-from cool_seq_tool import logger
+from cool_seq_tool import logger, SEQREPO_ROOT_DIR
 from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
 from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
     ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse
 from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings,\
     SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer
-from cool_seq_tool import SEQREPO_DATA_PATH, \
-    TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \
-    UTA_DB_URL
+from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, \
+    MANE_SUMMARY_PATH, UTA_DB_URL
 from cool_seq_tool.version import __version__
 
 
 class CoolSeqTool:
     """Class to initialize data sources."""
 
     def __init__(
-        self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
+        self,
         transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
         lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
         mane_data_path: Path = MANE_SUMMARY_PATH,
         db_url: str = UTA_DB_URL, db_pwd: str = "",
-        gene_query_handler: GeneQueryHandler = None,
-        gene_db_url: str = "", gene_db_region: str = "us-east-2"
+        gene_query_handler: Optional[GeneQueryHandler] = None,
+        gene_db_url: str = "", gene_db_region: str = "us-east-2",
+        sr: Optional[SeqRepo] = None
     ) -> None:
         """Initialize CoolSeqTool class
 
-        :param Path seqrepo_data_path: The path to the seqrepo directory.
         :param Path transcript_file_path: The path to transcript_mappings.tsv
         :param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
         :param Path mane_data_path: Path to RefSeq MANE summary data
         :param str db_url: PostgreSQL connection URL
             Format: `driver://user:pass@host/database/schema`
         :param str db_pwd: User's password for uta database
-        :param GeneQueryHandler gene_query_handler: Gene normalizer query handler
-            instance. If this is provided, will use a current instance. If this is not
-            provided, will create a new instance.
+        :param Optional[GeneQueryHandler] gene_query_handler: Gene normalizer query
+            handler instance. If this is provided, will use a current instance. If this
+            is not provided, will create a new instance.
         :param str gene_db_url: URL to gene normalizer dynamodb. Only used when
             `gene_query_handler` is `None`.
         :param str gene_db_region: AWS region for gene normalizer db. Only used when
             `gene_query_handler` is `None`.
+        :param Optional[SeqRepo] sr: SeqRepo instance. If this is not provided, will
+            create a new instance.
         """
-        self.seqrepo_access = SeqRepoAccess(
-            seqrepo_data_path=seqrepo_data_path)
+        if not sr:
+            sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
+        self.seqrepo_access = SeqRepoAccess(sr)
         self.transcript_mappings = TranscriptMappings(
             transcript_file_path=transcript_file_path,
             lrg_refseqgene_path=lrg_refseqgene_path)

diff --git a/cool_seq_tool/data_sources/mane_transcript.py b/cool_seq_tool/data_sources/mane_transcript.py
@@ -10,7 +10,6 @@
 import math
 from typing import Optional, Set, Tuple, Dict, List, Union
 
-import hgvs.parser
 import pandas as pd
 
 from cool_seq_tool.schemas import AnnotationLayer, Assembly, MappedManeData, \
@@ -47,7 +46,6 @@ def __init__(self, seqrepo_access: SeqRepoAccess,
         :param GeneNormalizer gene_normalizer: Access to Gene Normalizer
         """
         self.seqrepo_access = seqrepo_access
-        self.hgvs_parser = hgvs.parser.Parser()
         self.transcript_mappings = transcript_mappings
         self.mane_transcript_mappings = mane_transcript_mappings
         self.uta_db = uta_db

diff --git a/cool_seq_tool/data_sources/seqrepo_access.py b/cool_seq_tool/data_sources/seqrepo_access.py
@@ -1,28 +1,22 @@
 """A module for accessing SeqRepo."""
 from typing import Optional, List, Tuple, Union
 from os import environ
-from pathlib import Path
 
-from biocommons.seqrepo import SeqRepo
+from ga4gh.vrs.dataproxy import SeqRepoDataProxy
 
 from cool_seq_tool.schemas import ResidueMode
-from cool_seq_tool import SEQREPO_DATA_PATH, logger
+from cool_seq_tool import logger
 from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos
 
 
-class SeqRepoAccess:
+class SeqRepoAccess(SeqRepoDataProxy):
     """The SeqRepoAccess class."""
 
-    def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
-        """Initialize the SeqRepoAccess class.
-        :param Path seqrepo_data_path: The path to the seqrepo directory.
-        """
-        environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
-        self.seqrepo_client = SeqRepo(seqrepo_data_path)
+    environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
 
     def get_reference_sequence(
-            self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
-            residue_mode: str = ResidueMode.RESIDUE
+        self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
+        residue_mode: str = ResidueMode.RESIDUE
     ) -> Tuple[str, Optional[str]]:
         """Get reference sequence for an accession given a start and end position.
         If `start` and `end` are not given, it will return the entire reference sequence
@@ -45,7 +39,7 @@ def get_reference_sequence(
                 if start == end:
                     end += 1
         try:
-            sequence = self.seqrepo_client.fetch(ac, start=start, end=end)
+            sequence = self.sr.fetch(ac, start=start, end=end)
         except KeyError:
             msg = f"Accession, {ac}, not found in SeqRepo"
             logger.warning(msg)
@@ -77,7 +71,7 @@ def get_reference_sequence(
             return sequence, None
 
     def translate_identifier(
-            self, ac: str, target_namespace: Optional[Union[str, List[str]]] = None
+        self, ac: str, target_namespaces: Optional[Union[str, List[str]]] = None
     ) -> Tuple[List[str], Optional[str]]:
         """Return list of identifiers for accession.
 
@@ -86,31 +80,32 @@ def translate_identifier(
         :return: List of identifiers, warning
         """
         try:
-            ga4gh_identifiers = self.seqrepo_client.translate_identifier(
-                ac, target_namespaces=target_namespace)
+            ga4gh_identifiers = self.sr.translate_identifier(
+                ac, target_namespaces=target_namespaces)
         except KeyError:
             msg = f"SeqRepo unable to get translated identifiers for {ac}"
             logger.warning(msg)
             return [], msg
         else:
             return ga4gh_identifiers, None
 
-    def aliases(self,
-                input_str: str) -> Tuple[List[Optional[str]], Optional[str]]:
+    def translate_alias(
+        self, input_str: str
+    ) -> Tuple[List[Optional[str]], Optional[str]]:
         """Get aliases for a given input.
 
         :param str input_str: Input to get aliases for
         :return: List of aliases, warning
         """
         try:
-            return self.seqrepo_client.translate_alias(input_str), None
+            return self.sr.translate_alias(input_str), None
         except KeyError:
             msg = f"SeqRepo could not translate alias {input_str}"
             logger.warning(msg)
             return [], msg
 
     def chromosome_to_acs(
-            self, chromosome: str
+        self, chromosome: str
     ) -> Tuple[Optional[List[str]], Optional[str]]:
         """Get accessions for a chromosome
 
@@ -119,8 +114,8 @@ def chromosome_to_acs(
         """
         acs = []
         for assembly in ["GRCh38", "GRCh37"]:
-            tmp_acs = self.translate_identifier(f"{assembly}:chr{chromosome}",
-                                                target_namespace="refseq")[0]
+            tmp_acs, _ = self.translate_identifier(f"{assembly}:chr{chromosome}",
+                                                   target_namespaces="refseq")
             for ac in tmp_acs:
                 acs.append(ac.split("refseq:")[-1])
         if acs:
@@ -134,7 +129,7 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
         :param str ac: Accession
         :return: Chromosome, warning
         """
-        aliases, warning = self.aliases(ac)
+        aliases, _ = self.translate_alias(ac)
         aliases = ([a.split(":")[-1] for a in aliases
                     if a.startswith("GRCh") and "." not in a and "chr" not in a] or [None])[0]  # noqa: E501
         if aliases is None:

diff --git a/cool_seq_tool/data_sources/uta_database.py b/cool_seq_tool/data_sources/uta_database.py
@@ -23,7 +23,6 @@
 
 # Environment variables for paths to chain files for pyliftover
 LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
-LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
 
 
 class UTADatabase:
@@ -33,8 +32,7 @@ def __init__(
         self,
         db_url: str = UTA_DB_URL,
         db_pwd: str = "",
-        chain_file_37_to_38: Optional[str] = None,
-        chain_file_38_to_37: Optional[str] = None
+        chain_file_37_to_38: Optional[str] = None
     ) -> None:
         """Initialize DB class. Downstream libraries should use the create()
         method to construct a new instance: await UTADatabase.create()
@@ -46,10 +44,6 @@ def __init__(
             This is used for pyliftover. If this is not provided, will check to see if
             LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow
             pyliftover to download a chain file from UCSC
-        :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
-            This is used for pyliftover. If this is not provided, will check to see if
-            LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow
-            pyliftover to download a chain file from UCSC
         """
         self.schema = None
         self.db_url = db_url
@@ -63,12 +57,6 @@ def __init__(
         else:
             self.liftover_37_to_38 = LiftOver("hg19", "hg38")
 
-        chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
-        if chain_file_38_to_37:
-            self.liftover_38_to_37 = LiftOver(chain_file_38_to_37)
-        else:
-            self.liftover_38_to_37 = LiftOver("hg38", "hg19")
-
     @staticmethod
     def _update_db_url(db_pwd: str, db_url: str) -> str:
         """Return new db_url containing password.
@@ -1022,8 +1010,6 @@ def get_liftover(self, chromosome: str, pos: int,
 
         if liftover_to_assembly == Assembly.GRCH38:
             liftover = self.liftover_37_to_38.convert_coordinate(chromosome, pos)
-        elif liftover_to_assembly == Assembly.GRCH37:
-            liftover = self.liftover_38_to_37.convert_coordinate(chromosome, pos)
         else:
             logger.warning(f"{liftover_to_assembly} assembly not supported")
             liftover = None

diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py
@@ -1 +1 @@
-__version__ = "0.1.9"
+__version__ = "0.1.10"