Skip to content

Commit f258588

Browse files
authored
Merge pull request #281 from icbi-lab/issue-280
Fix #280
2 parents 4ad93bd + 05f38b9 commit f258588

File tree

5 files changed

+31
-6
lines changed

5 files changed

+31
-6
lines changed

scirpy/io/_datastructures.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,17 @@ def _split_chains(self) -> Tuple[bool, dict]:
184184
for tmp_chain in self.chains:
185185
if "locus" not in tmp_chain:
186186
split_chains["extra"].append(tmp_chain)
187-
elif tmp_chain["locus"] in self.VJ_LOCI and tmp_chain["productive"]:
187+
elif (
188+
tmp_chain["locus"] in self.VJ_LOCI
189+
and tmp_chain["productive"]
190+
and not _is_na2(tmp_chain["junction_aa"])
191+
):
188192
split_chains["VJ"].append(tmp_chain)
189-
elif tmp_chain["locus"] in self.VDJ_LOCI and tmp_chain["productive"]:
193+
elif (
194+
tmp_chain["locus"] in self.VDJ_LOCI
195+
and tmp_chain["productive"]
196+
and not _is_na2(tmp_chain["junction_aa"])
197+
):
190198
split_chains["VDJ"].append(tmp_chain)
191199
else:
192200
split_chains["extra"].append(tmp_chain)

scirpy/io/_io.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,9 @@ def read_airr(
349349
* at least one of `junction_aa` or `junction`.
350350
351351
Data should still import if one of these fields is missing, but they are required
352-
by most of scirpy's processing functions.
352+
by most of scirpy's processing functions. All chains for which the field
353+
`junction_aa` is missing or empty will be considered non-productive and
354+
will be moved to the `extra_chains` column.
353355
354356
{doc_working_model}
355357

scirpy/io/_util.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
and cells flagged as :term:`Multichain-cell`.
1414
* Non-productive chains are ignored.
1515
* Chain loci must be valid :term:`IMGT locus names<Chain locus>`.
16-
* Excess chains, non-productive chains, or chains with invalid loci
17-
are serialized to JSON and stored in the `extra_chains` column. They are not
18-
used by scirpy except when exporting the `AnnData` object to AIRR format.
16+
* Excess chains, non-productive chains, chains without a CDR3 sequence,
17+
or chains with invalid loci are serialized to JSON and stored in the
18+
`extra_chains` column. They are not used by scirpy except when exporting
19+
the `AnnData` object to AIRR format.
1920
2021
For more information, see :ref:`receptor-model`.
2122
"""
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
cell_id sequence_id UMI_sequence consensus_count duplicate_count sequence locus stop_codon vj_in_frame productive rev_comp v_call d_call j_call sequence_alignment germline_alignment sequence_alignment_aa germline_alignment_aa v_alignment_start v_alignment_end d_alignment_start d_alignment_end j_alignment_start j_alignment_end v_sequence_alignment v_sequence_alignment_aa v_germline_alignment v_germline_alignment_aa d_sequence_alignment d_sequence_alignment_aa d_germline_alignment d_germline_alignment_aa j_sequence_alignment j_sequence_alignment_aa j_germline_alignment j_germline_alignment_aa fwr1 fwr1_aa cdr1 cdr1_aa fwr2 fwr2_aa cdr2 cdr2_aa fwr3 fwr3_aa fwr4 fwr4_aa cdr3 cdr3_aa junction junction_length junction_aa junction_aa_length v_score d_score j_score v_cigar d_cigar j_cigar v_support d_support j_support v_identity d_identity j_identity v_sequence_start v_sequence_end v_germline_start v_germline_end d_sequence_start d_sequence_end d_germline_start d_germline_end j_sequence_start j_sequence_end j_germline_start j_germline_end fwr1_start fwr1_end cdr1_start cdr1_end fwr2_start fwr2_end cdr2_start cdr2_end fwr3_start fwr3_end fwr4_start fwr4_end cdr3_start cdr3_end np1 np1_length np2 np2_length
2+
AACCGAAAGCT CAATCGGT|CELL_ID=AACCGAAAGCT|PRFREQ=0.9390439525184472|CONSCOUNT=2927|DUPCOUNT=1 CAATCGGT 2927 1 TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTCGGGGGGGAGAGTCAAAACACCTTGTACTTTGGTGCGGGCACCCGACTATCGGTGCTAG TRB F T T T TRBV12-2*01 TRBD2*01 TRBJ2-4*01 TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTCGGGGGGGAGAGTCAAAACACCTTGTACTTTGGTGCGGGCACCCGACTATCGGTGCTAG TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTCGGGGGGGNNAGTCAAAACACCTTGTACTTTGGTGCGGGCACCCGACTATCGGTGCTAG SVQQFDDYHSEMNMSALELEDSAVYFCASSLGGESQNTLYFGAGTRLSVL SVQQFDDYHSEMNMSALELEDSAVYFCASSLGGXSQNTLYFGAGTRLSVL 1 93 94 100 103 151 TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTC SVQQFDDYHSEMNMSALELEDSAVYFCASSL TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTC SVQQFDDYHSEMNMSALELEDSAVYFCASSL GGGGGGG GG GGGGGGG GG AGTCAAAACACCTTGTACTTTGGTGCGGGCACCCGACTATCGGTGCTAG SQNTLYFGAGTRLSVL AGTCAAAACACCTTGTACTTTGGTGCGGGCACCCGACTATCGGTGCTAG SQNTLYFGAGTRLSVL TCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGT SVQQFDDYHSEMNMSALELEDSAVYFC TTTGGTGCGGGCACCCGACTATCGGTGCTA FGAGTRLSVL GCCAGCTCTCTCGGGGGGGAGAGTCAAAACACCTTGTAC ASSLGGESQNTLY TGTGCCAGCTCTCTCGGGGGGGAGAGTCAAAACACCTTGTACTTT 45 CASSLGGESQNTLYF 15 146.741 14.146 94.899 192N93M58S 93S6N7M51S1N 102S49M 7.31E-35 2.16E+02 1.65E-20 100.000 100.000 100.000 1 93 193 285 94 100 7 13 103 151 1 49 1 81 121 150 82 120 0 AG 2
3+
AACCGAAAGCT CATACACG|CELL_ID=AACCGAAAGCT|PRFREQ=0.88|CONSCOUNT=785|DUPCOUNT=4 CATACACG 785 4 GGACGATTCACAAACTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGTGCAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA TRA F T T T "TRAV14-1*01,TRAV14-2*01,TRAV14-2*03" "TRAJ31*01,TRAJ31*02" GGACGATTCACAAACTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGTGCAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA GGACGATTCACAATCTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGTGCAGCAAGNNNTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA GRFTNFFNKREKKLSLHITDSQPGDSATYFCAARGNRIFFGDGTQLVVKP GRFTIFFNKREKKLSLHITDSQPGDSATYFCAAXXNRIFFGDGTQLVVKP 1 101 105 151 GGACGATTCACAAACTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGTGCAGCAAG GRFTNFFNKREKKLSLHITDSQPGDSATYFCAA GGACGATTCACAATCTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGTGCAGCAAG GRFTIFFNKREKKLSLHITDSQPGDSATYFCAA TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA NRIFFGDGTQLVVKP TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA NRIFFGDGTQLVVKP GGACGATTCACAAACTTCTTCAATAAAAGGGAGAAAAAGCTCTCCTTGCACATCACAGACTCTCAGCCTGGAGACTCAGCTACCTACTTCTGT GRFTNFFNKREKKLSLHITDSQPGDSATYFC TTTGGTGATGGGACGCAGCTGGTGGTGAAGCCC FGDGTQLVVKP GCAGCAAGGGGTAACAGAATCTTC AARGNRIF TGTGCAGCAAGGGGTAACAGAATCTTCTTT 30 CAARGNRIFF 10 156.089 91.054 177N101M50S2N 104S10N47M 1.12E-37 2.36E-19 99.010 100.000 1 101 178 278 105 151 11 57 1 93 118 150 94 117 GGG 3
4+
AACCGAAAGCT CGTTACCC|CELL_ID=AACCGAAAGCT|PRFREQ=0.8305084745762712|CONSCOUNT=49|DUPCOUNT=1 CGTTACCC 49 1 AGCTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA TRA T F F T "TRAV14-1*01,TRAV14N-1*01" "TRAJ31*01,TRAJ31*02" CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCANNNNNTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA LTVWEGETAILNCSYEDSKG*QNLLW*WDAAGGEA LTVWEGETAILNCSYEDSXX*QNLLW*WDAAGGEA 1 57 63 109 CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCA LTVWEGETAILNCSYEDS CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCA LTVWEGETAILNCSYEDS TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA *QNLLW*WDAAGGEA TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA *QNLLW*WDAAGGEA CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAG LTVWEGETAILNCSYE GACAGCA DS 90.649 91.054 2S28N57M52S195N 64S10N47M 3.89E-18 1.69E-19 100.000 100.000 3 59 29 85 65 111 11 57 3 52 53 59 AGGGG 5
5+
AACCGAAAGCT CAGATGAC|CELL_ID=AACCGAAAGCT|PRFREQ=0.9730586370839936|CONSCOUNT=8396|DUPCOUNT=2 CAGATGAC 8396 2 GCTCGTGGGCTCGGNGATGNGTATAAGGGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG TRB F T T T "TRBV16*01,TRBV16*02,TRBV16*04" TRBJ2-1*01 AAGGGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCANNNNTAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG KGQEATLWCEPISGHSAVFWYRQTTYNYAEQFFGPGTRLTVL KGQEATLWCEPISGHSAVFWYRQTXXNYAEQFFGPGTRLTVL 1 73 78 127 AAGGGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA KGQEATLWCEPISGHSAVFWYRQT AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA KGQEATLWCEPISGHSAVFWYRQT TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG NYAEQFFGPGTRLTVL TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG NYAEQFFGPGTRLTVL AAGGGACAGGAAGCAACTCTGTGGTGTGAGCCAATT KGQEATLWCEPI TCAGGACATAGTGCT SGHSA GTTTTCTGGTACAGACAGACCA VFWYRQT 112.462 96.822 24S42N73M54S175N 101S50M 1.52E-24 4.34E-21 98.630 100.000 25 97 43 115 102 151 1 50 25 60 61 75 76 97 CATA 4
6+
AACCGAAAGCT CAGTGCTC|CELL_ID=AACCGAAAGCT|PRFREQ=0.9697986577181208|CONSCOUNT=578|DUPCOUNT=1 CAGTGCTC 578 1 NCTCGTNGGCTCGGNGATGTGTATAAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG TRB F T T T "TRBV16*01,TRBV16*02,TRBV16*04" TRBJ2-1*01 AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCANNNNTAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG KXQEATLWCEPISGHSAVFWYRQTTYNYAEQFFGPGTRLTVL KGQEATLWCEPISGHSAVFWYRQTXXNYAEQFFGPGTRLTVL 1 73 78 127 AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA KXQEATLWCEPISGHSAVFWYRQT AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA KGQEATLWCEPISGHSAVFWYRQT TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG NYAEQFFGPGTRLTVL TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG NYAEQFFGPGTRLTVL AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATT KXQEATLWCEPI TCAGGACATAGTGCT SGHSA GTTTTCTGGTACAGACAGACCA VFWYRQT 109.346 96.822 24S42N73M54S175N 101S50M 1.32E-23 4.34E-21 97.260 100.000 25 97 43 115 102 151 1 50 25 60 61 75 76 97 CATA 4

scirpy/tests/test_io.py

+8
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,14 @@ def test_read_tracer():
347347
assert cell2["IR_VDJ_1_j_call"] == "TRBJ2-5"
348348

349349

350+
@pytest.mark.conda
351+
def test_read_airr_issue280():
352+
"""Test that reading the example shown in issue #280 works."""
353+
anndata = read_airr(TESTDATA / "airr" / "tra_issue_280.tsv")
354+
assert anndata.obs["IR_VDJ_1_junction_aa"][0] == "CASSLGGESQNTLYF"
355+
assert anndata.obs["IR_VJ_1_junction_aa"][0] == "CAARGNRIFF"
356+
357+
350358
@pytest.mark.conda
351359
def test_read_airr():
352360
# Test that reading the files one-by-one or at once yields the same results

0 commit comments

Comments
 (0)