From ca92568790967e61a9068aea20b95af9d7524b46 Mon Sep 17 00:00:00 2001 From: Dominik Klein Date: Mon, 26 Aug 2024 12:23:20 +0200 Subject: [PATCH 01/12] add pancreas dataset --- docs/conf.py | 1 + pyproject.toml | 3 +- src/moscot/datasets.py | 118 ++++++++++++++++++++++++++++++++--------- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 311d1f6b2..34fe851cd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,6 +55,7 @@ "anndata": ("https://anndata.readthedocs.io/en/latest/", None), "scanpy": ("https://scanpy.readthedocs.io/en/latest/", None), "squidpy": ("https://squidpy.readthedocs.io/en/latest/", None), + "mudata": ("https://mudata.readthedocs.io/en/latest/", None), } master_doc = "index" pygments_style = "tango" diff --git a/pyproject.toml b/pyproject.toml index 014775ffe..f3b5e3291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,8 @@ dependencies = [ "ott-jax[neural]>=0.4.6", "cloudpickle>=2.2.0", "rich>=13.5", - "docstring_inheritance>=2.0.0" + "docstring_inheritance>=2.0.0", + "mudata>=0.3.0" ] [project.optional-dependencies] diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index a308d5867..b2c341807 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -7,7 +7,10 @@ import urllib.request from itertools import combinations from types import MappingProxyType -from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple +from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union + +import mudata +import mudata as mu import networkx as nx import numpy as np @@ -15,8 +18,6 @@ from scipy.linalg import block_diag import anndata as ad -from anndata import AnnData -from scanpy import read from moscot._types import PathLike @@ -36,7 +37,7 @@ def mosta( path: PathLike = "~/.cache/moscot/mosta.h5ad", force_download: bool = False, **kwargs: Any, -) -> AnnData: # pragma: no cover +) -> ad.AnnData: # pragma: no cover """Preprocessed and extracted data as 
provided in :cite:`chen:22`. Includes embryo sections `E9.5`, `E2S1`, `E10.5`, `E2S1`, `E11.5`, `E1S2`. @@ -59,6 +60,7 @@ def mosta( """ return _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/40569779", expected_shape=(54134, 2000), force_download=force_download, @@ -70,7 +72,7 @@ def hspc( path: PathLike = "~/.cache/moscot/hspc.h5ad", force_download: bool = False, **kwargs: Any, -) -> AnnData: # pragma: no cover +) -> ad.AnnData: # pragma: no cover """CD34+ hematopoietic stem and progenitor cells from 4 healthy human donors. From the `NeurIPS Multimodal Single-Cell Integration Challenge @@ -95,6 +97,7 @@ def hspc( """ dataset = _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/37993503", expected_shape=(4000, 2000), force_download=force_download, @@ -111,7 +114,7 @@ def drosophila( spatial: bool, force_download: bool = False, **kwargs: Any, -) -> AnnData: +) -> ad.AnnData: """Embryo of Drosophila melanogaster described in :cite:`Li-spatial:22`. Minimal pre-processing was performed, such as gene and cell filtering, as well as normalization. @@ -135,6 +138,7 @@ def drosophila( if spatial: return _load_dataset_from_url( path + "_sp.h5ad", + type="h5ad", backup_url="https://figshare.com/ndownloader/files/37984935", expected_shape=(3039, 82), force_download=force_download, @@ -143,6 +147,7 @@ def drosophila( return _load_dataset_from_url( path + "_sc.h5ad", + type="h5ad", backup_url="https://figshare.com/ndownloader/files/37984938", expected_shape=(1297, 2000), force_download=force_download, @@ -154,7 +159,7 @@ def c_elegans( path: PathLike = "~/.cache/moscot/c_elegans.h5ad", force_download: bool = False, **kwargs: Any, -) -> Tuple[AnnData, nx.DiGraph]: # pragma: no cover +) -> Tuple[ad.AnnData, nx.DiGraph]: # pragma: no cover """scRNA-seq time-series dataset of C.elegans embryogenesis :cite:`packer:19`. 
Contains raw counts of 46,151 cells with at least partial lineage information. @@ -175,6 +180,7 @@ def c_elegans( """ adata = _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/39943585", expected_shape=(46151, 20222), force_download=force_download, @@ -191,7 +197,7 @@ def zebrafish( path: PathLike = "~/.cache/moscot/zebrafish.h5ad", force_download: bool = False, **kwargs: Any, -) -> Tuple[AnnData, Dict[str, nx.DiGraph]]: +) -> Tuple[ad.AnnData, Dict[str, nx.DiGraph]]: """Lineage-traced scRNA-seq time-series dataset of Zebrafish heart regeneration :cite:`hu:22`. Contains gene expression vectors, LINNAEUS :cite:`spanjaard:18` reconstructed lineage trees, @@ -212,6 +218,7 @@ def zebrafish( """ adata = _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/39951073", expected_shape=(44014, 31466), force_download=force_download, @@ -230,7 +237,7 @@ def bone_marrow( rna: bool, force_download: bool = False, **kwargs: Any, -) -> AnnData: +) -> ad.AnnData: """Multiome data of bone marrow measurements :cite:`luecken:21`. Contains processed counts of 6,224 cells. The RNA data was filtered to 2,000 top @@ -256,6 +263,7 @@ def bone_marrow( if rna: return _load_dataset_from_url( path + "_rna.h5ad", + type="h5ad", backup_url="https://figshare.com/ndownloader/files/40195114", expected_shape=(6224, 2000), force_download=force_download, @@ -263,6 +271,7 @@ def bone_marrow( ) return _load_dataset_from_url( path + "_atac.h5ad", + type="h5ad", backup_url="https://figshare.com/ndownloader/files/41013551", expected_shape=(6224, 8000), force_download=force_download, @@ -270,11 +279,56 @@ def bone_marrow( ) +def pancreas_multiome( + rna_only: bool, + path: PathLike = "~/.cache/moscot/pancreas_multiome.h5mu", + force_download: bool = True, + **kwargs: Any, +) -> Union[mu.MuData, ad.AnnData]: # pragma: no cover + """Pancreatic endocrinogenesis dataset published with the moscot manuscript :cite:`Klein:23`. 
+ + The dataset contains paired scRNA-seq and scATAC-seq data of pancreatic cells at embryonic days 14.5, 15.5, and + 16.5. The data was preprocessed and filtered as described in the manuscript; the raw data and the full processed + data are available via GEO accession code GSE275562. + + Parameters + ---------- + rna_only + Only load the RNA data, resulting in a smaller file. + path + Path where to save the file. + force_download + Whether to force-download the data. + kwargs + Keyword arguments for :func:`anndata.read_h5ad` if `rna_only`, else for :func:`mudata.read`. + + Returns + ------- + :class:`anndata.AnnData` with RNA only if `rna_only`, else :class:`mudata.MuData` object with RNA and ATAC data. + """ + if rna_only: + return _load_dataset_from_url( + path, + type="h5ad", + backup_url="https://figshare.com/ndownloader/files/48785320", + expected_shape=(22604, 20242), + force_download=force_download, + **kwargs, + ) + return _load_dataset_from_url( + path, + type="h5mu", + backup_url="https://figshare.com/ndownloader/files/48782332", + expected_shape=(22604, 271918), + force_download=force_download, + ) + + def tedsim( path: PathLike = "~/.cache/moscot/tedsim.h5ad", force_download: bool = False, **kwargs: Any, -) -> AnnData: # pragma: no cover +) -> ad.AnnData: # pragma: no cover """Dataset simulated with TedSim :cite:`pan:22`. Simulated scRNA-seq dataset of a differentiation trajectory. For each cell, the dataset includes a (raw counts) @@ -302,6 +356,7 @@ def tedsim( """ return _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/40178644", expected_shape=(8448, 500), force_download=force_download, @@ -313,7 +368,7 @@ def sciplex( path: PathLike = "~/.cache/moscot/sciplex.h5ad", force_download: bool = False, **kwargs: Any, -) -> AnnData: # pragma: no cover +) -> ad.AnnData: # pragma: no cover """Perturbation dataset published in :cite:`srivatsan:20`. 
Transcriptomes of A549, K562, and mCF7 cells exposed to 188 compounds. @@ -334,6 +389,7 @@ def sciplex( """ return _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/43381398", expected_shape=(799317, 110984), force_download=force_download, @@ -345,7 +401,7 @@ def sim_align( path: PathLike = "~/.cache/moscot/sim_align.h5ad", force_download: bool = False, **kwargs: Any, -) -> AnnData: # pragma: no cover +) -> ad.AnnData: # pragma: no cover """Spatial transcriptomics simulated dataset as described in :cite:`Jones-spatial:22`. Parameters @@ -363,6 +419,7 @@ def sim_align( """ return _load_dataset_from_url( path, + type="h5ad", backup_url="https://figshare.com/ndownloader/files/37984926", expected_shape=(1200, 500), force_download=force_download, @@ -383,7 +440,7 @@ def simulate_data( lin_cost_matrix: Optional[str] = None, quad_cost_matrix: Optional[str] = None, **kwargs: Any, -) -> AnnData: +) -> ad.AnnData: """Simulate data. This function is used to generate data, mainly for the purpose of @@ -424,7 +481,7 @@ def simulate_data( """ rng = np.random.RandomState(seed) adatas = [ - AnnData( + ad.AnnData( X=rng.multivariate_normal( mean=kwargs.pop("mean", np.arange(n_genes)), cov=kwargs.pop("cov", var * np.diag(np.ones(n_genes))), @@ -477,32 +534,43 @@ def simulate_data( def _load_dataset_from_url( fpath: PathLike, + type: Literal["h5ad", "h5mu"], *, backup_url: str, expected_shape: Tuple[int, int], force_download: bool = False, - sparse: bool = True, - cache: bool = True, **kwargs: Any, -) -> AnnData: +) -> Union[ad.AnnData, mu.MuData]: + # TODO: make nicer once https://github.com/scverse/mudata/issues/76 resolved fpath = os.path.expanduser(fpath) - if not fpath.endswith(".h5ad"): + if type == "h5ad" and not fpath.endswith(".h5ad"): fpath += ".h5ad" - if force_download: + if type == "h5mu" and not fpath.endswith(".h5mu"): + fpath += ".h5mu" + + if not os.path.exists(fpath) or force_download: with tempfile.TemporaryDirectory() 
as tmpdir: - tmp = pathlib.Path(tmpdir) / "data.h5ad" - adata = read(filename=tmp, backup_url=backup_url, sparse=sparse, cache=cache, **kwargs) + tmp = pathlib.Path(tmpdir) / f"data.{type}" + urllib.request.urlretrieve(backup_url, tmp) + if type == "h5ad": + data = ad.read_h5ad(filename=tmp, **kwargs) + if type == "h5mu": + data = mudata.read(tmp, **kwargs) with contextlib.suppress(FileNotFoundError): os.remove(fpath) shutil.move(tmp, fpath) else: - adata = read(filename=fpath, backup_url=backup_url, sparse=sparse, cache=cache, **kwargs) + if type == "h5ad": + data = ad.read_h5ad(filename=fpath, **kwargs) + else: + raise NotImplementedError("MuData download only available with `force_download=True`.") - if adata.shape != expected_shape: - raise ValueError(f"Expected `AnnData` object to have shape `{expected_shape}`, found `{adata.shape}`.") + if data.shape != expected_shape: + data_str = "MuData" if type == "h5mu" else "AnnData" + raise ValueError(f"Expected {data_str} object to have shape `{expected_shape}`, found `{data.shape}`.") - return adata + return data def _get_random_trees( From 1ebc89cc6bdc71fd540b92dd92cd8caa9e45d933 Mon Sep 17 00:00:00 2001 From: Dominik Klein Date: Mon, 26 Aug 2024 12:25:17 +0200 Subject: [PATCH 02/12] fix logic --- src/moscot/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index b2c341807..c172a8166 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -549,7 +549,7 @@ def _load_dataset_from_url( if type == "h5mu" and not fpath.endswith(".h5mu"): fpath += ".h5mu" - if not os.path.exists(fpath) or force_download: + if not os.path.exists(fpath): with tempfile.TemporaryDirectory() as tmpdir: tmp = pathlib.Path(tmpdir) / f"data.{type}" urllib.request.urlretrieve(backup_url, tmp) From c699dc955f7e3d992fa26c8440f8dfe67ed5a498 Mon Sep 17 00:00:00 2001 From: Dominik Klein Date: Mon, 26 Aug 2024 13:20:29 +0200 Subject: [PATCH 03/12] adapt mudata dep 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f3b5e3291..d816f3260 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ dependencies = [ "cloudpickle>=2.2.0", "rich>=13.5", "docstring_inheritance>=2.0.0", - "mudata>=0.3.0" + "mudata>=0.2.0" ] [project.optional-dependencies] From b885e3cec2761470dc64bc0af4feb2481725deb7 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Mon, 9 Sep 2024 14:08:31 +0200 Subject: [PATCH 04/12] commit to test if it works with scanpy download function --- src/moscot/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index c172a8166..03109ecc2 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -548,11 +548,11 @@ def _load_dataset_from_url( if type == "h5mu" and not fpath.endswith(".h5mu"): fpath += ".h5mu" - + from scanpy.readwrite import _check_datafile_present_and_download if not os.path.exists(fpath): with tempfile.TemporaryDirectory() as tmpdir: tmp = pathlib.Path(tmpdir) / f"data.{type}" - urllib.request.urlretrieve(backup_url, tmp) + _check_datafile_present_and_download(backup_url=backup_url, path=tmp) if type == "h5ad": data = ad.read_h5ad(filename=tmp, **kwargs) if type == "h5mu": From d9ccf6016e7fc9f9b17d177521ef53bf93d6fbc5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:08:50 +0000 Subject: [PATCH 05/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index 03109ecc2..54b54dc4c 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -549,6 +549,7 @@ def _load_dataset_from_url( if type == "h5mu" and not fpath.endswith(".h5mu"): fpath += ".h5mu" from scanpy.readwrite import 
_check_datafile_present_and_download + if not os.path.exists(fpath): with tempfile.TemporaryDirectory() as tmpdir: tmp = pathlib.Path(tmpdir) / f"data.{type}" From dc84ff743ec4ecd75cd4d8c78d087ae0deed461b Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Mon, 9 Sep 2024 14:22:25 +0200 Subject: [PATCH 06/12] check again --- src/moscot/datasets.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index 54b54dc4c..a90208864 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -550,22 +550,10 @@ def _load_dataset_from_url( fpath += ".h5mu" from scanpy.readwrite import _check_datafile_present_and_download - if not os.path.exists(fpath): - with tempfile.TemporaryDirectory() as tmpdir: - tmp = pathlib.Path(tmpdir) / f"data.{type}" - _check_datafile_present_and_download(backup_url=backup_url, path=tmp) - if type == "h5ad": - data = ad.read_h5ad(filename=tmp, **kwargs) - if type == "h5mu": - data = mudata.read(tmp, **kwargs) - with contextlib.suppress(FileNotFoundError): - os.remove(fpath) - shutil.move(tmp, fpath) + if not os.path.exists(fpath) or force_download: + _check_datafile_present_and_download(backup_url=backup_url, path=fpath) else: - if type == "h5ad": - data = ad.read_h5ad(filename=fpath, **kwargs) - else: - raise NotImplementedError("MuData download only available with `force_download=True`.") + data = ad.read_h5ad(filename=fpath, **kwargs) if type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) if data.shape != expected_shape: data_str = "MuData" if type == "h5mu" else "AnnData" From db5b81d79f7c68b5e0fffb02d721d47bfe4f02ee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:22:50 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/datasets.py | 5 ----- 1 file changed, 5 
deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index a90208864..38c0cfe8a 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -1,15 +1,10 @@ -import contextlib import os -import pathlib import pickle -import shutil -import tempfile import urllib.request from itertools import combinations from types import MappingProxyType from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union -import mudata import mudata as mu import networkx as nx From c028bc720d5edf76654305a028f6a57f81e9e06a Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Mon, 9 Sep 2024 14:25:56 +0200 Subject: [PATCH 08/12] again --- src/moscot/datasets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index 38c0cfe8a..25051cf22 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -544,11 +544,9 @@ def _load_dataset_from_url( if type == "h5mu" and not fpath.endswith(".h5mu"): fpath += ".h5mu" from scanpy.readwrite import _check_datafile_present_and_download - if not os.path.exists(fpath) or force_download: _check_datafile_present_and_download(backup_url=backup_url, path=fpath) - else: - data = ad.read_h5ad(filename=fpath, **kwargs) if type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) + data = ad.read_h5ad(filename=fpath, **kwargs) if type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) if data.shape != expected_shape: data_str = "MuData" if type == "h5mu" else "AnnData" From 40d7b8ec09e44c9597641a955097fb17f55f7b8f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:27:05 +0000 Subject: [PATCH 09/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index 25051cf22..8d6ab4ef7 100644 --- 
a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -544,6 +544,7 @@ def _load_dataset_from_url( if type == "h5mu" and not fpath.endswith(".h5mu"): fpath += ".h5mu" from scanpy.readwrite import _check_datafile_present_and_download + if not os.path.exists(fpath) or force_download: _check_datafile_present_and_download(backup_url=backup_url, path=fpath) data = ad.read_h5ad(filename=fpath, **kwargs) if type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) From 0469bc50b84304bf06e0bddb4c4498f6f48ec476 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Mon, 9 Sep 2024 14:50:13 +0200 Subject: [PATCH 10/12] add force_download functionality and avoid using reserved name `type` --- src/moscot/datasets.py | 51 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index 8d6ab4ef7..e1f97c4df 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -16,6 +16,9 @@ from moscot._types import PathLike +from scanpy.readwrite import _check_datafile_present_and_download + + __all__ = [ "mosta", "hspc", @@ -55,7 +58,7 @@ def mosta( """ return _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/40569779", expected_shape=(54134, 2000), force_download=force_download, @@ -92,7 +95,7 @@ def hspc( """ dataset = _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/37993503", expected_shape=(4000, 2000), force_download=force_download, @@ -133,7 +136,7 @@ def drosophila( if spatial: return _load_dataset_from_url( path + "_sp.h5ad", - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/37984935", expected_shape=(3039, 82), force_download=force_download, @@ -142,7 +145,7 @@ def drosophila( return _load_dataset_from_url( path + "_sc.h5ad", - type="h5ad", + file_type="h5ad", 
backup_url="https://figshare.com/ndownloader/files/37984938", expected_shape=(1297, 2000), force_download=force_download, @@ -175,7 +178,7 @@ def c_elegans( """ adata = _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/39943585", expected_shape=(46151, 20222), force_download=force_download, @@ -213,7 +216,7 @@ def zebrafish( """ adata = _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/39951073", expected_shape=(44014, 31466), force_download=force_download, @@ -258,7 +261,7 @@ def bone_marrow( if rna: return _load_dataset_from_url( path + "_rna.h5ad", - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/40195114", expected_shape=(6224, 2000), force_download=force_download, @@ -266,7 +269,7 @@ def bone_marrow( ) return _load_dataset_from_url( path + "_atac.h5ad", - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/41013551", expected_shape=(6224, 8000), force_download=force_download, @@ -304,7 +307,7 @@ def pancreas_multiome( if rna_only: return _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/48785320", expected_shape=(22604, 20242), force_download=force_download, @@ -312,7 +315,7 @@ def pancreas_multiome( ) return _load_dataset_from_url( path, - type="h5mu", + file_type="h5mu", backup_url="https://figshare.com/ndownloader/files/48782332", expected_shape=(22604, 271918), force_download=force_download, @@ -351,7 +354,7 @@ def tedsim( """ return _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/40178644", expected_shape=(8448, 500), force_download=force_download, @@ -384,7 +387,7 @@ def sciplex( """ return _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/43381398", 
expected_shape=(799317, 110984), force_download=force_download, @@ -414,7 +417,7 @@ def sim_align( """ return _load_dataset_from_url( path, - type="h5ad", + file_type="h5ad", backup_url="https://figshare.com/ndownloader/files/37984926", expected_shape=(1200, 500), force_download=force_download, @@ -529,7 +532,7 @@ def simulate_data( def _load_dataset_from_url( fpath: PathLike, - type: Literal["h5ad", "h5mu"], + file_type: Literal["h5ad", "h5mu"], *, backup_url: str, expected_shape: Tuple[int, int], @@ -538,19 +541,17 @@ def _load_dataset_from_url( ) -> Union[ad.AnnData, mu.MuData]: # TODO: make nicer once https://github.com/scverse/mudata/issues/76 resolved fpath = os.path.expanduser(fpath) - if type == "h5ad" and not fpath.endswith(".h5ad"): - fpath += ".h5ad" - - if type == "h5mu" and not fpath.endswith(".h5mu"): - fpath += ".h5mu" - from scanpy.readwrite import _check_datafile_present_and_download - - if not os.path.exists(fpath) or force_download: - _check_datafile_present_and_download(backup_url=backup_url, path=fpath) - data = ad.read_h5ad(filename=fpath, **kwargs) if type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) + assert file_type in ["h5ad", "h5mu"], f"Invalid type `{file_type}`. Must be one of `['h5ad', 'h5mu']`." 
+ if not fpath.endswith(file_type): + fpath += f".{file_type}" + if force_download and os.path.exists(fpath): + os.remove(fpath) + if not _check_datafile_present_and_download(backup_url=backup_url, path=fpath): + raise FileNotFoundError(f"File `{fpath}` not found or download failed.") + data = ad.read_h5ad(filename=fpath, **kwargs) if file_type == "h5ad" else mu.read_h5mu(filename=fpath, backed=False) if data.shape != expected_shape: - data_str = "MuData" if type == "h5mu" else "AnnData" + data_str = "MuData" if file_type == "h5mu" else "AnnData" raise ValueError(f"Expected {data_str} object to have shape `{expected_shape}`, found `{data.shape}`.") return data From 40e63a4f34f714f27ed80b6ec6b1ca44b8c61e98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:51:33 +0000 Subject: [PATCH 11/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/moscot/datasets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/moscot/datasets.py b/src/moscot/datasets.py index e1f97c4df..58c1c8870 100644 --- a/src/moscot/datasets.py +++ b/src/moscot/datasets.py @@ -13,11 +13,9 @@ from scipy.linalg import block_diag import anndata as ad - -from moscot._types import PathLike - from scanpy.readwrite import _check_datafile_present_and_download +from moscot._types import PathLike __all__ = [ "mosta", From 73272be9fe2498ecdd8a85e5ddb14cdc28b4a5d3 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Mon, 9 Sep 2024 15:22:16 +0200 Subject: [PATCH 12/12] try this version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10be37fce..faa5d4b35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ dependencies = [ "cloudpickle>=2.2.0", "rich>=13.5", "docstring_inheritance>=2.0.0", - "mudata>=0.2.0" + "mudata>=0.2.2" ] [project.optional-dependencies]