Skip to content

Commit

Permalink
Merge pull request #1376 from microbiomedata/1337-ingest-pfam-entries
Browse files Browse the repository at this point in the history
Ingest PFAM entries and clans
  • Loading branch information
naglepuff authored Sep 20, 2024
2 parents 7988d1e + 2cffe6b commit 38f62a5
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 1 deletion.
33 changes: 33 additions & 0 deletions nmdc_server/ingest/kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
KoTermText,
KoTermToModule,
KoTermToPathway,
PfamEntryToClan,
)

ORTHOLOGY_URL = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=json"
Expand All @@ -26,11 +27,14 @@
# Note that we're using the same file for both COG terms and pathways
COG_PATHWAY_DEFS = COG_TERM_DEFS = "/data/ingest/cog/cog-20.def.tab"

# Both names deliberately point at the same file: it is the source for the
# Pfam term definitions as well as the Pfam clan definitions.
PFAM_TERM_DEFS = PFAM_CLAN_DEFS = "/data/ingest/pfam/Pfam-A.clans.tsv"


def load(db: Session) -> None:
    """Run every ingest step in this module against ``db``, in a fixed order."""
    ingest_steps = (
        ingest_ko_search,
        ingest_ko_module_map,
        ingest_ko_pathway_map,
        ingest_pfam_clan_map,
    )
    for run_step in ingest_steps:
        run_step(db)


def ingest_ko_search(db: Session) -> None:
Expand Down Expand Up @@ -76,6 +80,13 @@ def get_search_records_from_delimeted_file(
"pubmed_id",
"pdb_id",
]
# Column names for the Pfam clans TSV, in file order. They are supplied to
# csv.DictReader as ``fieldnames``, so each position here names the column at
# the same position in the file.
pfam_headers = [
    "pfam_accession",
    "clan_accession",
    "clan_name",
    "pfam_short_name",
    "pfam_name",
]

cog_function_headers = ["function_code", "sequence", "definition"]

Expand All @@ -100,6 +111,16 @@ def get_search_records_from_delimeted_file(
"term_key": cog_def_headers[0],
"text_key": cog_def_headers[2],
},
PFAM_TERM_DEFS: {
"fieldnames": pfam_headers,
"term_key": "pfam_accession",
"text_key": "pfam_name",
},
PFAM_CLAN_DEFS: {
"fieldnames": pfam_headers,
"term_key": "clan_accession",
"text_key": "clan_name",
},
}


Expand Down Expand Up @@ -179,3 +200,15 @@ def ingest_ko_pathway_map(db: Session) -> None:
[CogTermToPathway(term=mapping[0], pathway=mapping[1]) for mapping in mappings]
)
db.commit()


def ingest_pfam_clan_map(db: Session) -> None:
    """Ingest a mapping of Pfam entries to clans.

    Truncates the ``PfamEntryToClan`` table, reads the tab-delimited file at
    ``PFAM_CLAN_DEFS``, and saves one row per distinct
    (``pfam_accession``, ``clan_accession``) pair before committing.

    :param db: session used to truncate the table, bulk-save rows, and commit
    """
    db.execute(f"truncate table {PfamEntryToClan.__tablename__}")

    entry_key, clan_key = pfam_headers[0], pfam_headers[1]
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation recommends for file objects passed to a reader.
    with open(PFAM_CLAN_DEFS, newline="") as fd:
        reader = csv.DictReader(fd, fieldnames=pfam_headers, delimiter="\t")
        # A set collapses repeated (entry, clan) pairs so each is saved once.
        # NOTE(review): rows whose clan column is empty are ingested unchanged;
        # confirm whether entries without a clan should be skipped instead.
        mappings = {(row[entry_key], row[clan_key]) for row in reader}

    db.bulk_save_objects(
        [PfamEntryToClan(entry=entry, clan=clan) for entry, clan in mappings]
    )
    db.commit()
2 changes: 1 addition & 1 deletion nmdc_server/ingest/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

DataObjectList = List[str]
LoadObjectReturn = models.PipelineStep
gene_regex = re.compile(r"^(KEGG\.ORTHOLOGY|COG)")
gene_regex = re.compile(r"^(KEGG\.ORTHOLOGY|COG|PFAM)")


class LoadObject(Protocol):
Expand Down
39 changes: 39 additions & 0 deletions nmdc_server/migrations/versions/5fb9910ca8e6_add_pfam_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Add PFAM mappings
Revision ID: 5fb9910ca8e6
Revises: ff4e651c3007
Create Date: 2024-08-30 21:12:14.993046
"""

from typing import Optional

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "5fb9910ca8e6"
down_revision: Optional[str] = "ff4e651c3007"
branch_labels: Optional[str] = None
depends_on: Optional[str] = None


def upgrade():
    """Create the ``pfam_entry_to_clan`` table and a non-unique index on ``clan``."""
    table_name = "pfam_entry_to_clan"
    columns = [
        sa.Column("entry", sa.String(), nullable=False),
        sa.Column("clan", sa.String(), nullable=False),
    ]
    # Composite primary key over both columns.
    primary_key = sa.PrimaryKeyConstraint(
        "entry", "clan", name=op.f("pk_pfam_entry_to_clan")
    )
    op.create_table(table_name, *columns, primary_key)

    index_name = op.f("ix_pfam_entry_to_clan_clan")
    op.create_index(index_name, table_name, ["clan"], unique=False)


def downgrade():
    """Drop the ``clan`` index and then the ``pfam_entry_to_clan`` table."""
    table_name = "pfam_entry_to_clan"
    index_name = op.f("ix_pfam_entry_to_clan_clan")
    # The index is removed before the table it belongs to.
    op.drop_index(index_name, table_name=table_name)
    op.drop_table(table_name)
7 changes: 7 additions & 0 deletions nmdc_server/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ class CogTermToFunction(Base):
function = Column(String, nullable=False, primary_key=True, index=True)


class PfamEntryToClan(Base):
    """Association row linking one Pfam entry to one clan.

    Both columns together form the primary key; ``clan`` is additionally
    indexed.
    """

    __tablename__ = "pfam_entry_to_clan"

    # Entry identifier; first half of the composite primary key.
    entry = Column(String, primary_key=True, nullable=False)
    # Clan identifier; second half of the composite primary key, indexed.
    clan = Column(String, primary_key=True, index=True, nullable=False)


class KoTermText(Base):
__tablename__ = "ko_term_text"

Expand Down

0 comments on commit 38f62a5

Please sign in to comment.