Skip to content

Commit

Permalink
Merge pull request #142 from GenomicMedLab/staging
Browse files Browse the repository at this point in the history
Staging
  • Loading branch information
korikuzma authored Apr 4, 2023
2 parents 294a2e6 + 437a869 commit c8126d9
Show file tree
Hide file tree
Showing 15 changed files with 208 additions and 253 deletions.
5 changes: 2 additions & 3 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pydantic = "*"
fastapi = "*"
uvicorn = "*"
gene-normalizer = "*"
"ga4gh.vrs" = "*"

[dev-packages]
cool_seq_tool = {editable = true, path = "."}
Expand All @@ -25,11 +26,9 @@ flake8-docstrings = "*"
flake8-annotations = "*"
flake8-quotes = "*"
flake8-import-order = "*"
coverage = "*"
pytest-cov = "*"
coveralls = "*"
jupyterlab = "*"
pytest-asyncio = "==0.18.3"
ipython = "*"
ipykernel = "*"
psycopg2-binary = "*"
mock = "*"
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The **cool-seq-tool** provides:
### pip

```commandline
pip install cool-seq-tool
pip install cool-seq-tool[dev,tests]
```

### Development
Expand All @@ -30,7 +30,7 @@ Install backend dependencies and enter Pipenv environment:

```commandline
pipenv shell
pipenv lock && pipenv sync
pipenv update
pipenv install --dev
```

Expand Down Expand Up @@ -71,7 +71,7 @@ If you do not wish to use the default, you must set the environment variable `UT
#### SeqRepo
`cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
Use the `SEQREPO_ROOT_DIR` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
From the _root_ directory:
```
Expand Down
3 changes: 1 addition & 2 deletions cool_seq_tool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

UTA_DB_URL = environ.get("UTA_DB_URL",
"postgresql://uta_admin@localhost:5433/uta/uta_20210129")
SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
"/usr/local/share/seqrepo/latest"))
SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
f"{APP_ROOT}/data/transcript_mapping.tsv"))

Expand Down
29 changes: 16 additions & 13 deletions cool_seq_tool/cool_seq_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,54 @@
from typing import Optional, Union, List, Tuple, Dict
from pathlib import Path

from biocommons.seqrepo import SeqRepo
from gene.query import QueryHandler as GeneQueryHandler

from cool_seq_tool import logger
from cool_seq_tool import logger, SEQREPO_ROOT_DIR
from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse
from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings,\
SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer
from cool_seq_tool import SEQREPO_DATA_PATH, \
TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \
UTA_DB_URL
from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, \
MANE_SUMMARY_PATH, UTA_DB_URL
from cool_seq_tool.version import __version__


class CoolSeqTool:
"""Class to initialize data sources."""

def __init__(
self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
self,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
mane_data_path: Path = MANE_SUMMARY_PATH,
db_url: str = UTA_DB_URL, db_pwd: str = "",
gene_query_handler: GeneQueryHandler = None,
gene_db_url: str = "", gene_db_region: str = "us-east-2"
gene_query_handler: Optional[GeneQueryHandler] = None,
gene_db_url: str = "", gene_db_region: str = "us-east-2",
sr: Optional[SeqRepo] = None
) -> None:
"""Initialize CoolSeqTool class
:param Path seqrepo_data_path: The path to the seqrepo directory.
:param Path transcript_file_path: The path to transcript_mappings.tsv
:param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
:param Path mane_data_path: Path to RefSeq MANE summary data
:param str db_url: PostgreSQL connection URL
Format: `driver://user:pass@host/database/schema`
:param str db_pwd: User's password for uta database
:param GeneQueryHandler gene_query_handler: Gene normalizer query handler
instance. If this is provided, will use a current instance. If this is not
provided, will create a new instance.
:param Optional[GeneQueryHandler] gene_query_handler: Gene normalizer query
handler instance. If this is provided, will use a current instance. If this
is not provided, will create a new instance.
:param str gene_db_url: URL to gene normalizer dynamodb. Only used when
`gene_query_handler` is `None`.
:param str gene_db_region: AWS region for gene normalizer db. Only used when
`gene_query_handler` is `None`.
:param Optional[SeqRepo] sr: SeqRepo instance. If this is not provided, will
create a new instance.
"""
self.seqrepo_access = SeqRepoAccess(
seqrepo_data_path=seqrepo_data_path)
if not sr:
sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
self.seqrepo_access = SeqRepoAccess(sr)
self.transcript_mappings = TranscriptMappings(
transcript_file_path=transcript_file_path,
lrg_refseqgene_path=lrg_refseqgene_path)
Expand Down
2 changes: 0 additions & 2 deletions cool_seq_tool/data_sources/mane_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import math
from typing import Optional, Set, Tuple, Dict, List, Union

import hgvs.parser
import pandas as pd

from cool_seq_tool.schemas import AnnotationLayer, Assembly, MappedManeData, \
Expand Down Expand Up @@ -47,7 +46,6 @@ def __init__(self, seqrepo_access: SeqRepoAccess,
:param GeneNormalizer gene_normalizer: Access to Gene Normalizer
"""
self.seqrepo_access = seqrepo_access
self.hgvs_parser = hgvs.parser.Parser()
self.transcript_mappings = transcript_mappings
self.mane_transcript_mappings = mane_transcript_mappings
self.uta_db = uta_db
Expand Down
41 changes: 18 additions & 23 deletions cool_seq_tool/data_sources/seqrepo_access.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,22 @@
"""A module for accessing SeqRepo."""
from typing import Optional, List, Tuple, Union
from os import environ
from pathlib import Path

from biocommons.seqrepo import SeqRepo
from ga4gh.vrs.dataproxy import SeqRepoDataProxy

from cool_seq_tool.schemas import ResidueMode
from cool_seq_tool import SEQREPO_DATA_PATH, logger
from cool_seq_tool import logger
from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos


class SeqRepoAccess:
class SeqRepoAccess(SeqRepoDataProxy):
"""The SeqRepoAccess class."""

def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
"""Initialize the SeqRepoAccess class.
:param Path seqrepo_data_path: The path to the seqrepo directory.
"""
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
self.seqrepo_client = SeqRepo(seqrepo_data_path)
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"

def get_reference_sequence(
self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
residue_mode: str = ResidueMode.RESIDUE
self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
residue_mode: str = ResidueMode.RESIDUE
) -> Tuple[str, Optional[str]]:
"""Get reference sequence for an accession given a start and end position.
If `start` and `end` are not given, it will return the entire reference sequence
Expand All @@ -45,7 +39,7 @@ def get_reference_sequence(
if start == end:
end += 1
try:
sequence = self.seqrepo_client.fetch(ac, start=start, end=end)
sequence = self.sr.fetch(ac, start=start, end=end)
except KeyError:
msg = f"Accession, {ac}, not found in SeqRepo"
logger.warning(msg)
Expand Down Expand Up @@ -77,7 +71,7 @@ def get_reference_sequence(
return sequence, None

def translate_identifier(
self, ac: str, target_namespace: Optional[Union[str, List[str]]] = None
self, ac: str, target_namespaces: Optional[Union[str, List[str]]] = None
) -> Tuple[List[str], Optional[str]]:
"""Return list of identifiers for accession.
Expand All @@ -86,31 +80,32 @@ def translate_identifier(
:return: List of identifiers, warning
"""
try:
ga4gh_identifiers = self.seqrepo_client.translate_identifier(
ac, target_namespaces=target_namespace)
ga4gh_identifiers = self.sr.translate_identifier(
ac, target_namespaces=target_namespaces)
except KeyError:
msg = f"SeqRepo unable to get translated identifiers for {ac}"
logger.warning(msg)
return [], msg
else:
return ga4gh_identifiers, None

def aliases(self,
input_str: str) -> Tuple[List[Optional[str]], Optional[str]]:
def translate_alias(
self, input_str: str
) -> Tuple[List[Optional[str]], Optional[str]]:
"""Get aliases for a given input.
:param str input_str: Input to get aliases for
:return: List of aliases, warning
"""
try:
return self.seqrepo_client.translate_alias(input_str), None
return self.sr.translate_alias(input_str), None
except KeyError:
msg = f"SeqRepo could not translate alias {input_str}"
logger.warning(msg)
return [], msg

def chromosome_to_acs(
self, chromosome: str
self, chromosome: str
) -> Tuple[Optional[List[str]], Optional[str]]:
"""Get accessions for a chromosome
Expand All @@ -119,8 +114,8 @@ def chromosome_to_acs(
"""
acs = []
for assembly in ["GRCh38", "GRCh37"]:
tmp_acs = self.translate_identifier(f"{assembly}:chr{chromosome}",
target_namespace="refseq")[0]
tmp_acs, _ = self.translate_identifier(f"{assembly}:chr{chromosome}",
target_namespaces="refseq")
for ac in tmp_acs:
acs.append(ac.split("refseq:")[-1])
if acs:
Expand All @@ -134,7 +129,7 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
:param str ac: Accession
:return: Chromosome, warning
"""
aliases, warning = self.aliases(ac)
aliases, _ = self.translate_alias(ac)
aliases = ([a.split(":")[-1] for a in aliases
if a.startswith("GRCh") and "." not in a and "chr" not in a] or [None])[0] # noqa: E501
if aliases is None:
Expand Down
16 changes: 1 addition & 15 deletions cool_seq_tool/data_sources/uta_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

# Environment variables for paths to chain files for pyliftover
LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")


class UTADatabase:
Expand All @@ -33,8 +32,7 @@ def __init__(
self,
db_url: str = UTA_DB_URL,
db_pwd: str = "",
chain_file_37_to_38: Optional[str] = None,
chain_file_38_to_37: Optional[str] = None
chain_file_37_to_38: Optional[str] = None
) -> None:
"""Initialize DB class. Downstream libraries should use the create()
method to construct a new instance: await UTADatabase.create()
Expand All @@ -46,10 +44,6 @@ def __init__(
This is used for pyliftover. If this is not provided, will check to see if
LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow
pyliftover to download a chain file from UCSC
:param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
This is used for pyliftover. If this is not provided, will check to see if
LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow
pyliftover to download a chain file from UCSC
"""
self.schema = None
self.db_url = db_url
Expand All @@ -63,12 +57,6 @@ def __init__(
else:
self.liftover_37_to_38 = LiftOver("hg19", "hg38")

chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
if chain_file_38_to_37:
self.liftover_38_to_37 = LiftOver(chain_file_38_to_37)
else:
self.liftover_38_to_37 = LiftOver("hg38", "hg19")

@staticmethod
def _update_db_url(db_pwd: str, db_url: str) -> str:
"""Return new db_url containing password.
Expand Down Expand Up @@ -1022,8 +1010,6 @@ def get_liftover(self, chromosome: str, pos: int,

if liftover_to_assembly == Assembly.GRCH38:
liftover = self.liftover_37_to_38.convert_coordinate(chromosome, pos)
elif liftover_to_assembly == Assembly.GRCH37:
liftover = self.liftover_38_to_37.convert_coordinate(chromosome, pos)
else:
logger.warning(f"{liftover_to_assembly} assembly not supported")
liftover = None
Expand Down
2 changes: 1 addition & 1 deletion cool_seq_tool/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.9"
__version__ = "0.1.10"
Loading

0 comments on commit c8126d9

Please sign in to comment.