Skip to content

Commit

Permalink
Merge pull request #134 from GenomicMedLab/staging
Browse files Browse the repository at this point in the history
Staging
  • Loading branch information
korikuzma authored Mar 22, 2023
2 parents 4456bfd + b375a20 commit 66ba20a
Show file tree
Hide file tree
Showing 16 changed files with 589 additions and 446 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ If you do not wish to use the default, you must set the environment variable `UT
#### SeqRepo
`cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
From the _root_ directory:
```
pip install seqrepo
Expand All @@ -96,13 +98,15 @@ exit

![image](biomart.png)

Use the `TRANSCRIPT_MAPPINGS_PATH` environment variable to set the path of an already existing transcript mappings file. The default is `cool_seq_tool/data/transcript_mapping.tsv`.

#### LRG_RefSeqGene

`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene` if the environment variable `LRG_REFSEQGENE_PATH` is not set. When `LRG_REFSEQGENE_PATH` is set, `cool-seq-tool` will look at this path and expect the LRG_RefSeqGene file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).

#### MANE Summary Data

`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz` if the environment variable `MANE_SUMMARY_PATH` is not set. When `MANE_SUMMARY_PATH` is set, `cool-seq-tool` will look at this path and expect the MANE Summary Data file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).

## Starting the UTA Tools Service Locally

Expand Down
38 changes: 22 additions & 16 deletions cool_seq_tool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,27 @@

LOG_FN = "cool_seq_tool.log"

if "UTA_DB_URL" in environ:
UTA_DB_URL = environ["UTA_DB_URL"]
else:
UTA_DB_URL = "postgresql://uta_admin@localhost:5433/uta/uta_20210129"

if "SEQREPO_DATA_PATH" in environ:
SEQREPO_DATA_PATH = environ["SEQREPO_DATA_PATH"]
else:
SEQREPO_DATA_PATH = "/usr/local/share/seqrepo/latest"

TRANSCRIPT_MAPPINGS_PATH = f"{APP_ROOT}/data/transcript_mapping.tsv"

from cool_seq_tool.data import DataDownload # noqa: E402, I202
d = DataDownload()
MANE_SUMMARY_PATH = d._mane_summary_path
LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
UTA_DB_URL = environ.get("UTA_DB_URL",
"postgresql://uta_admin@localhost:5433/uta/uta_20210129")
SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
"/usr/local/share/seqrepo/latest"))
TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
f"{APP_ROOT}/data/transcript_mapping.tsv"))


MANE_SUMMARY_PATH = environ.get("MANE_SUMMARY_PATH")
LRG_REFSEQGENE_PATH = environ.get("LRG_REFSEQGENE_PATH")
if not all((MANE_SUMMARY_PATH, LRG_REFSEQGENE_PATH)):
from cool_seq_tool.data import DataDownload # noqa: E402, I202
d = DataDownload()

if not MANE_SUMMARY_PATH:
MANE_SUMMARY_PATH = d._mane_summary_path

if not LRG_REFSEQGENE_PATH:
LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
MANE_SUMMARY_PATH = Path(MANE_SUMMARY_PATH)
LRG_REFSEQGENE_PATH = Path(LRG_REFSEQGENE_PATH)


from cool_seq_tool.cool_seq_tool import CoolSeqTool # noqa: E402, F401, I202
40 changes: 25 additions & 15 deletions cool_seq_tool/cool_seq_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from typing import Optional, Union, List, Tuple, Dict
from pathlib import Path

from gene.query import QueryHandler as GeneQueryHandler

from cool_seq_tool import logger
from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
Expand All @@ -18,24 +20,31 @@
class CoolSeqTool:
"""Class to initialize data sources."""

def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH,
mane_data_path: str = MANE_SUMMARY_PATH,
db_url: str = UTA_DB_URL, db_pwd: str = "",
gene_db_url: str = "", gene_db_region: str = "us-east-2"
) -> None:
def __init__(
self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
mane_data_path: Path = MANE_SUMMARY_PATH,
db_url: str = UTA_DB_URL, db_pwd: str = "",
gene_query_handler: GeneQueryHandler = None,
gene_db_url: str = "", gene_db_region: str = "us-east-2"
) -> None:
"""Initialize CoolSeqTool class
:param str seqrepo_data_path: The path to the seqrepo directory.
:param str transcript_file_path: The path to transcript_mappings.tsv
:param str lrg_refseqgene_path: The path to LRG_RefSeqGene
:param str mane_data_path: Path to RefSeq MANE summary data
:param Path seqrepo_data_path: The path to the seqrepo directory.
:param Path transcript_file_path: The path to transcript_mappings.tsv
:param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
:param Path mane_data_path: Path to RefSeq MANE summary data
:param str db_url: PostgreSQL connection URL
Format: `driver://user:pass@host/database/schema`
:param str db_pwd: User's password for uta database
:param str gene_db_url: URL to gene normalizer dynamodb
:param str gene_db_region: AWS region for gene normalizer db
:param GeneQueryHandler gene_query_handler: Gene normalizer query handler
instance. If this is provided, will use a current instance. If this is not
provided, will create a new instance.
:param str gene_db_url: URL to gene normalizer dynamodb. Only used when
`gene_query_handler` is `None`.
:param str gene_db_region: AWS region for gene normalizer db. Only used when
`gene_query_handler` is `None`.
"""
self.seqrepo_access = SeqRepoAccess(
seqrepo_data_path=seqrepo_data_path)
Expand All @@ -45,7 +54,8 @@ def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
self.mane_transcript_mappings = MANETranscriptMappings(
mane_data_path=mane_data_path)
self.uta_db = UTADatabase(db_url=db_url, db_pwd=db_pwd)
gene_normalizer = GeneNormalizer(gene_db_url, gene_db_region)
gene_normalizer = GeneNormalizer(gene_query_handler, gene_db_url,
gene_db_region)
self.alignment_mapper = AlignmentMapper(
self.seqrepo_access, self.transcript_mappings, self.uta_db)
self.mane_transcript = MANETranscript(
Expand Down Expand Up @@ -471,7 +481,7 @@ async def _set_genomic_data(self, params: Dict, strand: int,
if not grch38_ac:
return f"Invalid genomic accession: {params['chr']}"

grch38_ac = grch38_ac[0][0]
grch38_ac = grch38_ac[0]
if grch38_ac != params["chr"]: # params["chr"] is genomic accession
# Liftover to 38
descr = await self.uta_db.get_chr_assembly(params["chr"])
Expand Down
19 changes: 15 additions & 4 deletions cool_seq_tool/data_sources/gene_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,24 @@
class GeneNormalizer:
"""Gene Normalizer class for getting gene data"""

def __init__(self, db_url: str = "", db_region: str = "us-east-2") -> None:
def __init__(
self, query_handler: QueryHandler = None, db_url: str = "",
db_region: str = "us-east-2"
) -> None:
"""Initialize gene normalizer class
:param str db_url: URL to gene normalizer dynamodb
:param str db_region: AWS region for gene normalizer db
:param QueryHandler query_handler: Gene normalizer query handler instance.
If this is provided, will use a current instance. If this is not provided,
will create a new instance.
:param str db_url: URL to gene normalizer dynamodb. Only used when
`query_handler` is `None`.
:param str db_region: AWS region for gene normalizer db. Only used when
`query_handler` is `None`.
"""
self.query_handler = QueryHandler(db_url, db_region)
if query_handler:
self.query_handler = query_handler
else:
self.query_handler = QueryHandler(db_url, db_region)

def get_hgnc_data(self, gene: str) -> Dict:
"""Return HGNC data for a given gene
Expand Down
2 changes: 1 addition & 1 deletion cool_seq_tool/data_sources/mane_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ async def g_to_grch38(self, ac: str, start_pos: int,

newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
if newest_ac:
ac = newest_ac[0][0]
ac = newest_ac[0]
if self._validate_index(ac, (start_pos, end_pos), 0):
return dict(
ac=ac,
Expand Down
5 changes: 3 additions & 2 deletions cool_seq_tool/data_sources/mane_transcript_mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""The module for loading MANE Transcript mappings to genes."""
from pathlib import Path
from typing import Dict, Optional, List

import pandas as pd
Expand All @@ -9,9 +10,9 @@
class MANETranscriptMappings:
"""The MANE Transcript mappings class."""

def __init__(self, mane_data_path: str = MANE_SUMMARY_PATH) -> None:
def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
"""Initialize the MANE Transcript mappings class.
:param str mane_data_path: Path to RefSeq MANE summary data
:param Path mane_data_path: Path to RefSeq MANE summary data
"""
self.mane_data_path = mane_data_path
self.df = self._load_mane_transcript_data()
Expand Down
5 changes: 3 additions & 2 deletions cool_seq_tool/data_sources/seqrepo_access.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""A module for accessing SeqRepo."""
from typing import Optional, List, Tuple, Union
from os import environ
from pathlib import Path

from biocommons.seqrepo import SeqRepo

Expand All @@ -12,9 +13,9 @@
class SeqRepoAccess:
"""The SeqRepoAccess class."""

def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH) -> None:
def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
"""Initialize the SeqRepoAccess class.
:param str seqrepo_data_path: The path to the seqrepo directory.
:param Path seqrepo_data_path: The path to the seqrepo directory.
"""
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
self.seqrepo_client = SeqRepo(seqrepo_data_path)
Expand Down
18 changes: 9 additions & 9 deletions cool_seq_tool/data_sources/transcript_mappings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""The module for Transcript Mappings."""
import csv
from pathlib import Path
from typing import Dict, List, Optional

from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH
Expand All @@ -8,12 +9,12 @@
class TranscriptMappings:
"""The transcript mappings class."""

def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH) -> None:
def __init__(self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH) -> None:
"""Initialize the transcript mappings class.
:param str transcript_file_path: Path to transcript mappings file
:param str lrg_refseqgene_path: Path to LRG RefSeqGene file
:param Path transcript_file_path: Path to transcript mappings file
:param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
"""
# ENSP <-> Gene Symbol
self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {}
Expand Down Expand Up @@ -51,11 +52,10 @@ def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
self._load_transcript_mappings_data(transcript_file_path)
self._load_refseq_gene_symbol_data(lrg_refseqgene_path)

def _load_transcript_mappings_data(self,
transcript_file_path: str) -> None:
def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
"""Load transcript mappings file to dictionaries.
:param str transcript_file_path: Path to transcript mappings file
:param Path transcript_file_path: Path to transcript mappings file
"""
with open(transcript_file_path) as file:
reader = csv.DictReader(file, delimiter="\t")
Expand Down Expand Up @@ -96,10 +96,10 @@ def _load_transcript_mappings_data(self,
self.ensp_to_enst[versioned_protein_transcript] = \
versioned_transcript

def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: str) -> None:
def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None:
"""Load data from RefSeq Gene Symbol file to dictionaries.
:param str lrg_refseqgene_path: Path to LRG RefSeqGene file
:param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
"""
with open(lrg_refseqgene_path) as file:
reader = csv.DictReader(file, delimiter="\t")
Expand Down
Loading

0 comments on commit 66ba20a

Please sign in to comment.