Merge pull request #109 from icbi-lab/10x-io-test

grst · web-flow · commit 90ba34bf8b05 · 2020-04-14T12:15:53.000+02:00
Merge read_10x_vdj and read_10x_vdj_csv.
diff --git a/docs/api.rst b/docs/api.rst
@@ -17,7 +17,6 @@ Input/Output
 
    read_h5ad
    read_10x_vdj
-   read_10x_vdj_csv
    read_tracer
 
 
diff --git a/scirpy/__init__.py b/scirpy/__init__.py
@@ -7,7 +7,7 @@
 __author__ = ", ".join(["Gregor Sturm", "Tamas Szabo"])
 
 from scanpy import AnnData, read_h5ad
-from ._io import read_10x_vdj, read_tracer, read_10x_vdj_csv
+from ._io import read_10x_vdj, read_tracer
 from . import _preprocessing as pp
 from . import _tools as tl
 from . import _plotting as pl
diff --git a/scirpy/_io/_io.py b/scirpy/_io/_io.py
@@ -74,9 +74,11 @@ def _process_tcr_cell(tcr_obj: TcrCell) -> dict:
     res_dict["cell_id"] = tcr_obj.cell_id
     chain_dict = dict()
     for c in ["TRA", "TRB"]:
+        # Break ties in `expr` by `expr_raw`, then by `cdr3`, so that the chain
+        # order is the same whether the data was loaded from json or from csv.
         tmp_chains = sorted(
             [x for x in tcr_obj.chains if x.chain_type == c and x.is_productive],
-            key=lambda x: x.expr,
+            key=lambda x: (x.expr, x.expr_raw, x.cdr3),
             reverse=True,
         )
         res_dict["multi_chain"] = res_dict.get("multi_chain", False) | (
@@ -126,23 +128,8 @@ def _process_tcr_cell(tcr_obj: TcrCell) -> dict:
     return res_dict
 
 
-@_doc_params(doc_working_model=doc_working_model)
-def read_10x_vdj(path: str, filtered: bool = True) -> AnnData:
-    """Read TCR data from a 10x genomics sample.
-
-    {doc_working_model}
-    
-    Parameters
-    ----------
-    path
-        Path to all_contig_annotations.json
-    filtered
-        Only keep filtered contig annotations (= is_cell and high_confidence)
-
-    Returns
-    -------
-    AnnData object with TCR data in `obs` for each cell.  
-    """
+def _read_10x_vdj_json(path: str, filtered: bool = True) -> AnnData:
+    """Read TCR data from a 10x genomics `all_contig_annotations.json` file"""
     with open(path, "r") as f:
         cells = json.load(f)
 
@@ -230,29 +217,8 @@ def read_10x_vdj(path: str, filtered: bool = True) -> AnnData:
     return _tcr_objs_to_anndata(tcr_objs.values())
 
 
-@_doc_params(doc_working_model=doc_working_model)
-def read_10x_vdj_csv(path: str, filtered: bool = True) -> AnnData:
-    """Read TCR data from a 10x genomics `_contig_annotations.csv` file
-
-    If the `all_contig_annotations.json` file is available it is perferable! 
-    For instance, the `csv` file does not contain information about
-    junctions. 
-
-    {doc_working_model}
-
-    Parameters
-    ----------
-    path
-        Path to filterd_contig_annotations.csv or all_contig_annotationgs.csv
-    filtered
-        Only keep filtered contig annotations (= is_cell and high_confidence)
-        If using `filtered_contig_annotations.csv` already, this option
-        is futile. 
-
-    Returns
-    -------
-    AnnData object with TCR data in `obs` for each cell. 
-    """
+def _read_10x_vdj_csv(path: str, filtered: bool = True) -> AnnData:
+    """Read TCR data from a 10x genomics `_contig_annotations.csv` file """
     df = pd.read_csv(path)
 
     tcr_objs = {}
@@ -283,6 +249,35 @@ def read_10x_vdj_csv(path: str, filtered: bool = True) -> AnnData:
     return _tcr_objs_to_anndata(tcr_objs.values())
 
 
+@_doc_params(doc_working_model=doc_working_model)
+def read_10x_vdj(path: str, filtered: bool = True) -> AnnData:
+    """Read TCR data from 10x Genomics cell-ranger output. 
+
+    Supports `all_contig_annotations.json` and `{{all,filtered}}_contig_annotations.csv`. 
+    If the json file is available, it is preferable as it additionally 
+    contains information about VDJ-junction insertions. 
+
+    {doc_working_model}
+
+    Parameters
+    ----------
+    path
+        Path to all_contig_annotations.json, filtered_contig_annotations.csv or all_contig_annotations.csv
+    filtered
+        Only keep filtered contig annotations (= is_cell and high_confidence)
+        If `path` already points to `filtered_contig_annotations.csv`, this option
+        has no effect.
+
+    Returns
+    -------
+    AnnData object with TCR data in `obs` for each cell.  
+    """
+    if path.endswith("json"):
+        return _read_10x_vdj_json(path, filtered)
+    else:
+        return _read_10x_vdj_csv(path, filtered)
+
+
 @_doc_params(doc_working_model=doc_working_model)
 def read_tracer(path: str) -> AnnData:
     """Read data from TraCeR. 
diff --git a/tests/data/10x/vdj_nextgem_hs_pbmc3_t_filtered_contig_annotations.csv.gz b/tests/data/10x/vdj_nextgem_hs_pbmc3_t_filtered_contig_annotations.csv.gz
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -1,19 +1,26 @@
-from scirpy import read_10x_vdj, read_tracer, read_10x_vdj_csv
+from scirpy import read_10x_vdj, read_tracer
 from scirpy._util import _is_na, _is_false
 import numpy as np
 import pytest
 
 
-# def test_read_10x_example():
-#     anndata = read_10x_vdj("tutorial/example_data/10x/all_contig_annotations.json")
+def test_read_10x_example():
+    """Test that a full 10x CSV table can be imported without errors.
+
+    Test-dataset from https://support.10xgenomics.com/single-cell-vdj/datasets/3.1.0/vdj_nextgem_hs_pbmc3
+    under CC-BY-4.0
+    """
+    anndata = read_10x_vdj(
+        "tests/data/10x/vdj_nextgem_hs_pbmc3_t_filtered_contig_annotations.csv.gz"
+    )
 
 
 # def test_read_tracer_example():
 #     anndata = read_tracer("tutorial/example_data/tracer/tracer_100")
 
 
 def test_read_10x_csv():
-    anndata = read_10x_vdj_csv("tests/data/10x/filtered_contig_annotations.csv")
+    anndata = read_10x_vdj("tests/data/10x/filtered_contig_annotations.csv")
     obs = anndata.obs
     assert obs.shape[0] == 4
     cell1 = obs.iloc[1, :]