Skip to content

Commit f033280

Browse files
committed
Move cdr3_from_junction into separate function
1 parent 7ae2d7f commit f033280

File tree

2 files changed

+51
-25
lines changed

2 files changed

+51
-25
lines changed

scirpy/io/_io.py

+33-24
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,29 @@
4949
DEFAULT_AIRR_CELL_ATTRIBUTES = ("is_cell", "high_confidence", "multi_chain")
5050

5151

52+
def _cdr3_from_junction(junction_aa, junction_nt):
    """Derive the CDR3 sequences from the junction sequences.

    CDR3 equals the junction without the conserved residues C and W/F,
    respectively. Should the conserved residues not be equal to C and W/F,
    then the chain is non-productive and we set CDR3 to None.

    The amino-acid and nucleotide junctions are evaluated independently of
    each other.

    See also https://github.com/icbi-lab/scirpy/pull/290.

    Parameters
    ----------
    junction_aa
        Amino-acid sequence of the junction. Anything that is not a string
        (e.g. `None` or a NaN read from a CSV file) is treated as missing.
    junction_nt
        Nucleotide sequence of the junction. Anything that is not a string
        is treated as missing.

    Returns
    -------
    Tuple `(cdr3_aa, cdr3_nt)`. Each element is `None` if the corresponding
    junction is missing or is not flanked by the conserved residues.
    """
    cdr3_aa, cdr3_nt = None, None

    # `isinstance(..., str)` rejects None as well as float NaN / pd.NA, which
    # are not subscriptable. `startswith`/`endswith` are safe on empty strings,
    # whereas indexing with [0] / [-1] would raise an IndexError.
    if (
        isinstance(junction_aa, str)
        and junction_aa.startswith("C")
        and junction_aa.endswith(("W", "F"))
    ):
        cdr3_aa = junction_aa[1:-1]

    # Require at least two full codons so that the leading and the trailing
    # conserved codon cannot overlap.
    if (
        isinstance(junction_nt, str)
        and len(junction_nt) >= 6
        and _translate_dna_to_protein(junction_nt[:3]) == "C"
        and _translate_dna_to_protein(junction_nt[-3:]) in ("W", "F")
    ):
        cdr3_nt = junction_nt[3:-3]

    return cdr3_aa, cdr3_nt
73+
74+
5275
def _read_10x_vdj_json(
5376
path: Union[str, Path],
5477
filtered: bool = True,
@@ -151,15 +174,9 @@ def _read_10x_vdj_json(
151174
chain[col] = cell[col].get("nt_seq") if cell[col] else None
152175
chain[col + "_aa"] = cell[col].get("aa_seq") if cell[col] else None
153176

154-
# trim cdr3 if starts with "C" and ends with W/F
155-
chain["cdr3_aa"] = (
156-
chain["junction_aa"][1:-1]
157-
if chain["junction_aa"] is not None
158-
and chain["junction_aa"][0] == "C"
159-
and chain["junction_aa"][-1] in "WF"
160-
else None
177+
chain["cdr3_aa"], chain["cdr3"] = _cdr3_from_junction(
178+
chain["junction_aa"], chain["junction"]
161179
)
162-
chain["cdr3"] = chain["junction"][3:-3] if chain["cdr3_aa"] else None
163180

164181
ir_obj.add_chain(chain)
165182

@@ -209,16 +226,8 @@ def _read_10x_vdj_csv(
209226
if col + "_nt" in chain_series.index:
210227
chain_dict[col] = chain_series.get(col + "_nt")
211228

212-
# trim cdr3 if starts with "C" and ends with W/F
213-
chain_dict["cdr3_aa"] = (
214-
chain_dict["junction_aa"][1:-1]
215-
if not pd.isna(chain_dict["junction_aa"])
216-
and chain_dict["junction_aa"][0] == "C"
217-
and chain_dict["junction_aa"][-1] in "WF"
218-
else None
219-
)
220-
chain_dict["cdr3"] = (
221-
chain_dict["junction"][3:-3] if chain_dict["cdr3_aa"] else None
229+
chain_dict["cdr3_aa"], chain_dict["cdr3"] = _cdr3_from_junction(
230+
chain_dict["junction_aa"], chain_dict["junction"]
222231
)
223232

224233
ir_obj.add_chain(chain_dict)
@@ -254,12 +263,12 @@ def read_10x_vdj(
254263
filtered
255264
Only keep filtered contig annotations (i.e. `is_cell` and `high_confidence`).
256265
If using `filtered_contig_annotations.csv` already, this option
257-
include_fields
258-
The fields to include in `adata`. The AIRR rearrangment schema contains
259-
can contain a lot of columns, most of which irrelevant for most analyses.
260-
Per default, this includes a subset of columns relevant for a typical
261-
scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}.
262-
Set this to `None` to include all columns.
266+
include_fields
267+
The fields to include in `adata`. The AIRR rearrangement schema
268+
can contain a lot of columns, most of which are irrelevant for most analyses.
269+
Per default, this includes a subset of columns relevant for a typical
270+
scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}.
271+
Set this to `None` to include all columns.
263272
is futile.
264273
265274

scirpy/tests/test_io.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
upgrade_schema,
1313
AirrCell,
1414
)
15-
from scirpy.io._io import _infer_locus_from_gene_names
15+
from scirpy.io._io import _infer_locus_from_gene_names, _cdr3_from_junction
1616
from scirpy.io._util import _check_upgrade_schema
1717
from scirpy.util import _is_na, _is_false
1818
import numpy as np
@@ -49,6 +49,23 @@ def anndata_from_10x_sample(request):
4949
return _read_anndata_from_10x_sample(request.param).copy()
5050

5151

52+
@pytest.mark.parametrize(
    ("junction_aa", "junction_nt", "cdr3_aa", "cdr3_nt"),
    [
        # Junction starts with C and ends with F: both flanks are trimmed.
        (
            "CQQYGSSLTWTF",
            "TGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTC",
            "QQYGSSLTWT",
            "CAGCAGTATGGTAGCTCACTTACGTGGACG",
        ),
        # Junction ends with S instead of W/F: no CDR3 is reported.
        ("CYSHSPTSMWVS", "TGCTACTCACATTCACCTACTAGCATGTGGGTGTCC", None, None),
        # Missing junctions yield missing CDR3s.
        (None, None, None, None),
    ],
)
def test_cdr3_from_junction(junction_aa, junction_nt, cdr3_aa, cdr3_nt):
    """Check the CDR3 sequences derived from amino-acid/nucleotide junctions."""
    expected = (cdr3_aa, cdr3_nt)
    observed = _cdr3_from_junction(junction_aa, junction_nt)
    assert observed == expected
67+
68+
5269
@pytest.mark.parametrize(
5370
"anndata_from_10x_sample",
5471
[

0 commit comments

Comments
 (0)