|
49 | 49 | DEFAULT_AIRR_CELL_ATTRIBUTES = ("is_cell", "high_confidence", "multi_chain")
|
50 | 50 |
|
51 | 51 |
|
| 52 | +def _cdr3_from_junction(junction_aa, junction_nt): |
| 53 | + """CDR3 euqals junction without the conserved residues C and W/F, respectively. |
| 54 | + Should the conserved residues not equal to C and W/F, then the chain |
| 55 | + is non-productive and we set CDR3 to None. |
| 56 | +
|
| 57 | + See also https://github.com/icbi-lab/scirpy/pull/290. |
| 58 | + """ |
| 59 | + cdr3_aa, cdr3_nt = None, None |
| 60 | + if ( |
| 61 | + junction_aa is not None |
| 62 | + and junction_aa[0] == "C" |
| 63 | + and junction_aa[-1] in ("W", "F") |
| 64 | + ): |
| 65 | + cdr3_aa = junction_aa[1:-1] |
| 66 | + if ( |
| 67 | + junction_nt is not None |
| 68 | + and _translate_dna_to_protein(junction_nt[:3]) == "C" |
| 69 | + and _translate_dna_to_protein(junction_nt[-3:]) in ("W", "F") |
| 70 | + ): |
| 71 | + cdr3_nt = junction_nt[3:-3] |
| 72 | + return cdr3_aa, cdr3_nt |
| 73 | + |
| 74 | + |
52 | 75 | def _read_10x_vdj_json(
|
53 | 76 | path: Union[str, Path],
|
54 | 77 | filtered: bool = True,
|
@@ -151,15 +174,9 @@ def _read_10x_vdj_json(
|
151 | 174 | chain[col] = cell[col].get("nt_seq") if cell[col] else None
|
152 | 175 | chain[col + "_aa"] = cell[col].get("aa_seq") if cell[col] else None
|
153 | 176 |
|
154 |
| - # trim cdr3 if starts with "C" and ends with W/F |
155 |
| - chain["cdr3_aa"] = ( |
156 |
| - chain["junction_aa"][1:-1] |
157 |
| - if chain["junction_aa"] is not None |
158 |
| - and chain["junction_aa"][0] == "C" |
159 |
| - and chain["junction_aa"][-1] in "WF" |
160 |
| - else None |
| 177 | + chain["cdr3_aa"], chain["cdr3"] = _cdr3_from_junction( |
| 178 | + chain["junction_aa"], chain["junction"] |
161 | 179 | )
|
162 |
| - chain["cdr3"] = chain["junction"][3:-3] if chain["cdr3_aa"] else None |
163 | 180 |
|
164 | 181 | ir_obj.add_chain(chain)
|
165 | 182 |
|
@@ -209,16 +226,8 @@ def _read_10x_vdj_csv(
|
209 | 226 | if col + "_nt" in chain_series.index:
|
210 | 227 | chain_dict[col] = chain_series.get(col + "_nt")
|
211 | 228 |
|
212 |
| - # trim cdr3 if starts with "C" and ends with W/F |
213 |
| - chain_dict["cdr3_aa"] = ( |
214 |
| - chain_dict["junction_aa"][1:-1] |
215 |
| - if not pd.isna(chain_dict["junction_aa"]) |
216 |
| - and chain_dict["junction_aa"][0] == "C" |
217 |
| - and chain_dict["junction_aa"][-1] in "WF" |
218 |
| - else None |
219 |
| - ) |
220 |
| - chain_dict["cdr3"] = ( |
221 |
| - chain_dict["junction"][3:-3] if chain_dict["cdr3_aa"] else None |
| 229 | + chain_dict["cdr3_aa"], chain_dict["cdr3"] = _cdr3_from_junction( |
| 230 | + chain_dict["junction_aa"], chain_dict["junction"] |
222 | 231 | )
|
223 | 232 |
|
224 | 233 | ir_obj.add_chain(chain_dict)
|
@@ -254,12 +263,12 @@ def read_10x_vdj(
|
254 | 263 | filtered
|
255 | 264 | Only keep filtered contig annotations (i.e. `is_cell` and `high_confidence`).
|
256 | 265 | If using `filtered_contig_annotations.csv` already, this option
|
257 |
| - include_fields |
258 |
| - The fields to include in `adata`. The AIRR rearrangment schema contains |
259 |
| - can contain a lot of columns, most of which irrelevant for most analyses. |
260 |
| - Per default, this includes a subset of columns relevant for a typical |
261 |
| - scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}. |
262 |
| - Set this to `None` to include all columns. |
| 266 | + include_fields |
| 267 | + The fields to include in `adata`. The AIRR rearrangement schema |
| 268 | + can contain a lot of columns, most of which are irrelevant for most analyses. |
| 269 | + Per default, this includes a subset of columns relevant for a typical |
| 270 | + scirpy analysis, to keep `adata.obs` a bit cleaner. Defaults to {include_fields}. |
| 271 | + Set this to `None` to include all columns. |
263 | 272 | is futile.
|
264 | 273 |
|
265 | 274 |
|
|
0 commit comments