Merge pull request #10 from cmatKhan/v1.1.0
V1.1.0
cmatKhan authored Oct 24, 2023
2 parents 1a6e147 + 4f3e0cf commit 613e6ef
Showing 30 changed files with 287 additions and 169 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/codecov.yml
@@ -6,9 +6,6 @@ jobs:
   build:
     runs-on: ubuntu-latest
     name: Test CallingCardsTools
-    defaults:
-      run:
-        working-directory: ./CallingCardsTools
     steps:
     - uses: actions/checkout@v1
     - uses: actions/setup-python@v2
@@ -20,6 +17,6 @@ jobs:
         pip install poetry
         poetry install
     - name: Run tests and collect coverage
-      run: pytest --cov=./ --cov-report=xml
+      run: poetry run pytest --cov=./ --cov-report=xml
     - name: Upload coverage reports to Codecov with GitHub Action
       uses: codecov/codecov-action@v3
3 changes: 0 additions & 3 deletions .github/workflows/pyright.yml
@@ -6,8 +6,5 @@ jobs:
   build:
     runs-on: ubuntu-latest
     name: Pyright Type Checking
-    defaults:
-      run:
-        working-directory: ./CallingCardsTools
     steps:
     - uses: jakebailey/pyright-action@v1
37 changes: 21 additions & 16 deletions README.md
@@ -1,49 +1,54 @@
-# Introduction
+# callingCardsTools
+
+## Introduction
 
 `CallingCardsTools` provides both an API and a number of cmd line tools
 for processing raw Calling Cards data. This is used in the
 [nf-core/callingcards](https://github.com/nf-core/callingcards) pipeline,
 which provides a workflow to process both yeast and mammalian Calling Cards data.
 
-# Documentation
+## Documentation
 
 [Served Documentation](https://cmatkhan.github.io/callingCardsTools/) provides
 information on filetypes and the API. For help with the cmd line tools,
 simply install callingcardstools (see below) and do:
 
-```
+```bash
 callingcardstools --help
 ```
 
 Each of the cmd line tools also provides a `--help` message.
 
-# Installation
+## Installation
 
-```
+```bash
 pip install callingcardstools
 ```
 
 To start using the command line tools, see the help message with:
 
-```
+```bash
 callingcardstools --help
 ```
 
 Callingcardstools is containerized:
 
-```
-docker pull cmatkhan/callingcardstools
-```
+- A singularity container is hosted on
+[Galaxyhub](https://depot.galaxyproject.org/singularity/). If you go to this
+site, make sure the 'c's have loaded and then search for 'callingcardstools'.
+There is a container for each version which is on bioconda. Make sure you get
+the correct version.
 
-```
-singularity pull cmatkhan/callingcardstools
-```
+- A docker container is hosted on
+[Dockerhub](https://quay.io/repository/biocontainers/callingcardstools).
+Again, make sure you get the correct version.
 
-# Development Installation
+## Development Installation
 
 1. install [poetry](https://python-poetry.org/)
-  - I prefer to set the default location of the virtual environment to the
-  project directory. You can set that as a global configuration for your
+
+   - I prefer to set the default location of the virtual environment to the
+   project directory. You can set that as a global configuration for your
    poetry installation like so: `poetry config virtualenvs.in-project true`
 
 2. git clone the repo
@@ -54,7 +59,7 @@
 
 5. build the package with `poetry build`
 
-6. install the callingcardstools package into your virtual environment 
+6. install the callingcardstools package into your virtual environment
    `pip install dist/callingcardstools-...`
   - Note: you could figure out how to use the pip install `-e` flag to
   have an interactive development environment. I don't think that is compatible
109 changes: 60 additions & 49 deletions callingcardstools/Alignment/mammals/Qbed.py
@@ -7,6 +7,7 @@
 from typing import Iterable, DefaultDict
 from functools import partial
 import csv
+import re
 # outside dependencies
 import pandas as pd
 
@@ -73,17 +74,27 @@ def __init__(self, data_type=int):
 
 
 class Qbed():
-    """An object to write records from a tagged_read_dict to qbed file and
-    qc files.
-    """
+    """
+    An object to write records from a tagged_read_dict to qbed file and
+    qc files. See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8150125/
+    for more details.
+
+    Attributes:
+        qbed_fields (list): List of strings. Values in list are column names
+            for qbed file.
+        qbed (OuterDefaultDict): A nested defaultdict object. The keys are the
+            qbed_fields. The values are the counts of each record.
+        status_dict (DefaultDict): A defaultdict object. The keys are the
+            status flags. The values are the counts of each status flag.
+    """
     _qbed_fields: list
     _qbed: DefaultDict
     _status_dict: DefaultDict
 
     def __init__(self, pickle_path: str = None) -> None:
-        """Create a ReadRecords object. This object will write records to
-        a qbed file and a qc file.
+        """
+        Create a ReadRecords object. This object will write records to
+        a qbed file and a qc file.
 
         Args:
             pickle_path: Path to a pickle file to load. If None, then
@@ -96,7 +107,8 @@ def __init__(self, pickle_path: str = None) -> None:
             self.load(pickle_path)
         else:
             # set qbed fields
-            self.qbed_fields = ['chr', 'start', 'end', 'strand', 'depth']
+            self.qbed_fields = ['chr', 'start', 'end', 'depth',
+                                'strand', 'annotation']
             self.qbed = OuterDefaultDict(int)
             self.status_dict = DefaultDict(int)
@@ -108,8 +120,8 @@ def qbed_fields(self):
     @qbed_fields.setter
     def qbed_fields(self, value: list):
         """Set the qbed fields"""
-        if not len(value) == 5:
-            raise ValueError('qbed_fields must have 5 values')
+        if not len(value) == 6:
+            raise ValueError('qbed_fields must have 6 values')
         self._qbed_fields = value
 
     @property
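
A quick illustration of the tightened setter contract, assuming `Qbed` is imported from this module and constructed with no pickle path:

```python
from callingcardstools.Alignment.mammals.Qbed import Qbed

qbed = Qbed()
qbed.qbed_fields = ['chr', 'start', 'end', 'depth', 'strand', 'annotation']  # accepted
try:
    qbed.qbed_fields = ['chr', 'start', 'end', 'strand', 'depth']  # old 5-column layout
except ValueError as err:
    print(err)  # qbed_fields must have 6 values
```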
@@ -150,12 +162,12 @@ def _combine(self, other):
             for start, value2 in value1.items():
                 for end, value3 in value2.items():
                     for strand, value4 in value3.items():
-                        for annotation_str, count in value4.items():
+                        for srt_seq, count in value4.items():
                             (self.qbed[chr]
                              [start]
                              [end]
                              [strand]
-                             [annotation_str]) += count
+                             [srt_seq]) += count
 
         # Combine status_dict property
         for status, count in other.status_dict.items():
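
The renamed loop implements the count merge behind `__add__`; the same logic on plain nested dicts looks like this (hypothetical data, independent of the class):

```python
def combine(dest, src):
    """Add the leaf counts of src into dest; both map
    chr -> start -> end -> strand -> srt_seq -> count."""
    for chrom, starts in src.items():
        for start, ends in starts.items():
            for end, strands in ends.items():
                for strand, srts in strands.items():
                    leaf = (dest.setdefault(chrom, {})
                                .setdefault(start, {})
                                .setdefault(end, {})
                                .setdefault(strand, {}))
                    for srt_seq, count in srts.items():
                        leaf[srt_seq] = leaf.get(srt_seq, 0) + count

a = {'chr1': {100: {101: {'+': {'GTCAGT': 2}}}}}
b = {'chr1': {100: {101: {'+': {'GTCAGT': 3, 'AATCGG': 1}}}}}
combine(a, b)
print(a)  # {'chr1': {100: {101: {'+': {'GTCAGT': 5, 'AATCGG': 1}}}}}
```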
@@ -169,8 +181,8 @@ def _srt_writer(self, output_path: str,
         fieldnames = ['srt_type', 'count']
 
         # Create a DictWriter instance with a tab delimiter.
-        writer = csv.DictWriter(tsvfile, 
-                                fieldnames=fieldnames, 
+        writer = csv.DictWriter(tsvfile,
+                                fieldnames=fieldnames,
                                 delimiter='\t')
 
         # Write
@@ -181,7 +193,7 @@ def _srt_writer(self, output_path: str,
                 'count': multi_srt_count})
 
     # public methods ---------------------------------------------------------
 
     def load(self, file_path: str) -> None:
         """Load a BarcodeQcCounter object from a file using Pickle.
@@ -229,45 +241,45 @@ def __add__(self, other: "Qbed") -> "Qbed":
     def update(self,
                tagged_read: dict,
                status: int,
-               insert_offset=1,
-               annotation_tags: list = None) -> None:
+               insert_offset: int = 1,
+               srt_tag: str = 'ST') -> None:
         """write records to both the raw qbed tmpfile and raw qc tmpfile.
         Note that these tempfiles will be destroyed when the object is
         destroyed.
 
         Args:
-            tagged_read (dict): A pysam.AlignedSegment object which has been
-                tagged with the appropriate calling cards tags based on the
+            tagged_read (dict): A pysam.AlignedSegment object which has been
+                tagged with the appropriate calling cards tags based on the
                 BarcodeParser object used to create the object.
             status (int): A value which reflects how the read performs
-                based on pre-defined quality metrics. A status of 0 is considered
-                a pass. A status of greater than 0 is a read which fails
-                at least 1 quality metric
+                based on pre-defined quality metrics. A status of 0 is
+                considered a pass. A status of greater than 0 is a read which
+                fails at least 1 quality metric
             insert_offset (int): number to add to tag XI value to calculate
-                the end coordinate. For instance, if the start coord is the first
-                T in TTAA, then the offset would be 4.
-            annotation_tags (list): List of strings. Values in list are tags to
-                extract from tagged_read dictionary. Values of tag will be added
-                to the annotation column of the qbed as a string delimited by '/'.
+                the end coordinate. For instance, if the start coord is the
+                first T in TTAA, then the offset would be 4.
+            srt_tag (str): The tag which corresponds to the SRT sequence
+                of a given read. This will be included in the annotation
+                column of the mammals qbed file.
         """
         if len({'read', 'barcode_details'}-tagged_read.keys()) > 0:
             raise KeyError('tagged_read must have keys '
                            '{"read","barcode_details"}')
 
         if status == 0:
-            # create the annotation field. If the annotation_tags list is not
-            # empty, this will try to extract the value in the tag from the
-            # tagged_read. KeyError is raised if that tag DNE. Empty string
-            # is created if annotation_tags is empty list
-            annotation = ("/".join(tagged_read['read'].get_tag(x).split('/')[0]
-                                   for x in annotation_tags)
-                          if annotation_tags else '')
+            # for mammals, the SRT tag is expected. This will raise a KeyError
+            # if the SRT tag is not present
+            try:
+                srt_with_edit_dist = tagged_read['read'].get_tag(srt_tag)
+                srt = re.sub(r'\/\d+', '', srt_with_edit_dist)
+            except KeyError as exc:
+                raise KeyError(
+                    f"tagged_read must have SRT key {srt_tag}") from exc
             chr = tagged_read['read'].reference_name
             start = tagged_read['read'].get_tag('XI')
            end = tagged_read['read'].get_tag('XI') + insert_offset
             strand = '+' if tagged_read['read'].is_forward else '-'
 
-            self.qbed[chr][start][end][strand][annotation] += 1
+            self.qbed[chr][start][end][strand][srt] += 1
 
         self.status_dict[status] += 1
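
The SRT handling and coordinate arithmetic above reduce to a couple of lines that can be checked in isolation (the tag values here are hypothetical; in practice they come from pysam tags):

```python
import re

# An SRT tag value carries an edit-distance suffix, e.g. 'GTCAGT/1';
# the suffix is stripped before the sequence is used as the annotation.
print(re.sub(r'\/\d+', '', 'GTCAGT/1'))  # GTCAGT

# end = XI tag value + insert_offset; with the start on the first T of a
# TTAA insertion site, an offset of 4 spans the full TTAA motif.
start = 1000000  # hypothetical XI tag value
end = start + 4
print(start, end)  # 1000000 1000004
```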

@@ -294,18 +306,17 @@ def write(self,
             for start, value2 in value1.items():
                 for end, value3 in value2.items():
                     for strand, value4 in value3.items():
-                        hop_count = 0
-                        hop_annotation_set = set()
-                        for annotation_str, count in value4.items():
-                            hop_count += count
-                            hop_annotation_set.add(annotation_str)
-                        # add a hop record to the qbed DataFrame
-                        qbed_df = qbed_df.append(
-                            pd.Series([chr, start, end, strand, hop_count],
-                                      index=self.qbed_fields),
-                            ignore_index=True)
+                        locus_srt_set = set()
+                        for srt_seq, count in value4.items():
+                            locus_srt_set.add(srt_seq)
+                            # add a hop record to the qbed DataFrame
+                            qbed_df = qbed_df.append(
+                                pd.Series([chr, start, end, count,
+                                           strand, srt_seq],
+                                          index=self.qbed_fields),
+                                ignore_index=True)
                         # count single/multi srt as appropriate
-                        if len(hop_annotation_set) > 1:
+                        if len(locus_srt_set) > 1:
                             multi_srt_counter += 1
                         else:
                             single_srt_counter += 1
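
For clarity, the single/multi SRT bookkeeping above amounts to the following: one qbed row is emitted per (locus, SRT) pair, and each locus is tallied once (the loci here are hypothetical):

```python
loci = {
    ('chr1', 100, 101, '+'): {'GTCAGT': 5},               # one SRT: single
    ('chr2', 200, 201, '-'): {'GTCAGT': 2, 'AATCGG': 1},  # two SRTs: multi
}
single_srt_counter = 0
multi_srt_counter = 0
for locus, srt_counts in loci.items():
    if len(srt_counts) > 1:
        multi_srt_counter += 1
    else:
        single_srt_counter += 1
print(single_srt_counter, multi_srt_counter)  # 1 1
```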
@@ -334,14 +345,14 @@ def write(self,
             qc_output_file = filename + '_' + suffix + '_aln_summary.tsv' \
                 if suffix else filename + '_aln_summary.tsv'
             logger.info("writing qc summary to %s", qc_output_file)
-            status_df.to_csv(qc_output_file, 
-                             sep='\t', 
-                             index=False, 
+            status_df.to_csv(qc_output_file,
+                             sep='\t',
+                             index=False,
                              header=False)
 
             srt_output_file = filename + '_' + suffix + '_srt_count.tsv' \
                 if suffix else filename + '_srt_count.tsv'
             logger.info("writing srt summary to %s", srt_output_file)
-            self._srt_writer(srt_output_file, 
-                             single_srt_counter, 
+            self._srt_writer(srt_output_file,
+                             single_srt_counter,
                              multi_srt_counter)