From 3b011da43103fe2d9137a6431b4ccda2183fb2a4 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <mail@gregor-sturm.de>
Date: Sun, 20 Oct 2024 13:38:15 +0200
Subject: [PATCH 1/4] Add Stephenson datasets

---
 docs/api.rst                    |  1 +
 src/scirpy/datasets/__init__.py | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/docs/api.rst b/docs/api.rst
index 1c4a41282..ed6363442 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -246,6 +246,7 @@ Example datasets
    datasets.wu2020
    datasets.wu2020_3k
    datasets.maynard2020
+   datasets.stephenson2021_5k
 
 Reference databases
 ^^^^^^^^^^^^^^^^^^^
diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
index d262e9ed3..2cbb3778a 100644
--- a/src/scirpy/datasets/__init__.py
+++ b/src/scirpy/datasets/__init__.py
@@ -28,7 +28,7 @@
 
 _FIGSHARE = pooch.create(
     path=pooch.os_cache("scirpy"),
-    base_url="doi:10.6084/m9.figshare.22249894.v1",
+    base_url="doi:10.6084/m9.figshare.22249894.v2",
     version=version("scirpy"),
     version_dev="main",
     env="SCIRPY_DATA_DIR",
@@ -36,6 +36,7 @@
         "wu2020.h5mu": "md5:ed30d9c1c44cae544f4c080a2451118b",
         "wu2020_3k.h5mu": "md5:12c57c790f8a403751304c9de5a18cbf",
         "maynard2020.h5mu": "md5:da64ac62e3e92c80eaf0e8eef6537ac7",
+        "stephenson2021_5k.h5mu": "md5:6ea26f9d95525371ff9028f8e99ed474",
     },
 )
 _POOCH_INFO = dedent(
@@ -124,6 +125,34 @@ def maynard2020() -> MuData:
     return mudata.read_h5mu(fname)
 
 
+@_doc_params(
+    processing_code=indent(_read_to_str(HERE / "_processing_scripts/maynard2020.py"), " " * 8),
+    pooch_info=_POOCH_INFO,
+)
+def stephenson2021_5k() -> MuData:
+    """\
+    Return the dataset from :cite:`Maynard2020` as AnnData object.
+
+    21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have :term:`TCRs<TCR>`
+    and 1,500 have :term:`BCRs<BCR>`.
+
+    {pooch_info}
+
+    The raw FASTQ files have been obtained from `PRJNA591860 <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__
+    and processed using the nf-core `RNA-seq pipeline <https://github.com/nf-core/rnaseq>`_ to obtain
+    gene expression and TraCeR/BraCeR to reconstruct receptors.
+
+    The processed files have been imported and transformed into an :class:`anndata.AnnData`
+    object using the following script:
+
+    .. code-block:: python
+
+        {processing_code}
+    """
+    fname = cast(PathLike, _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True))
+    return mudata.read_h5mu(fname)
+
+
 def vdjdb(cached: bool = True, *, cache_path="data/vdjdb.h5ad") -> AnnData:
     """\
     Download VDJdb and process it into an AnnData object.

From 2c3e97ab435966da9e92fda8c6fb9a4fb31d1cfb Mon Sep 17 00:00:00 2001
From: Gregor Sturm <mail@gregor-sturm.de>
Date: Sun, 20 Oct 2024 13:39:33 +0200
Subject: [PATCH 2/4] Add changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 948438ac5..95d523534 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning][].
 ### Additions
 
 -   Add a `mask_obs` argument to `tl.clonotype_network` that allows to compute the clonotype networks on a subset of the cells ([#557](https://github.com/scverse/scirpy/pull/557)).
+-   Add `datasets.stephenson2021_5k`, an example dataset for the upcoming BCR tutorial ([#565](https://github.com/scverse/scirpy/pull/565)).
 
 ### Fixes
 

From 055e9585f3726e5bda28870c0d735c7d4d18fee7 Mon Sep 17 00:00:00 2001
From: Mario Kanetscheider <m.kanne@ymail.com>
Date: Tue, 22 Oct 2024 13:46:59 +0200
Subject: [PATCH 3/4] Added preprocessing description regarding stephenson_5k
 dataset

---
 src/scirpy/datasets/__init__.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
index 2cbb3778a..5a1f966a7 100644
--- a/src/scirpy/datasets/__init__.py
+++ b/src/scirpy/datasets/__init__.py
@@ -131,23 +131,18 @@ def maynard2020() -> MuData:
 )
 def stephenson2021_5k() -> MuData:
     """\
-    Return the dataset from :cite:`Maynard2020` as AnnData object.
-
-    21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have :term:`TCRs<TCR>`
-    and 1,500 have :term:`BCRs<BCR>`.
-
-    {pooch_info}
-
-    The raw FASTQ files have been obtained from `PRJNA591860 <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__
-    and processed using the nf-core `RNA-seq pipeline <https://github.com/nf-core/rnaseq>`_ to obtain
-    gene expression and TraCeR/BraCeR to reconstruct receptors.
+    Return the dataset from :cite:`Stephenson2021` as MuData object, downsampled
+    to 5000 BCR-containing cells.
 
-    The processed files have been imported and transformed into an :class:`anndata.AnnData`
-    object using the following script:
+    The original study sequenced 1,141,860 cells from 143 PBMC samples collected from patients with varying COVID-19 severity and from control groups.
+    Gene expression, TCR-enriched and BCR-enriched libraries were prepared for each sample according to the 10x Genomics protocol and sequenced on a NovaSeq 6000.
 
-    .. code-block:: python
+    A preprocessed dataset for the transcriptome library was obtained from `ArrayExpress <https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-10026>`__.
+    A preprocessed dataset for the BCR-enriched library was obtained from `clatworthylab's GitHub <https://github.com/clatworthylab/COVID_analysis>`__.
+    Both dataset have already passed quality control and all cells that didn't express BCR were discarded.
 
-        {processing_code}
+    To  speed up computation time, we solely included 5 samples from each of the COVID-19-positive groups and randomly subsampled down to a total of 5k cells.
+    
     """
     fname = cast(PathLike, _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True))
     return mudata.read_h5mu(fname)

From f4666996f9b4b97b77d3ac3930d37690663a9a52 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:09:07 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/scirpy/datasets/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
index 5a1f966a7..d5ee521f8 100644
--- a/src/scirpy/datasets/__init__.py
+++ b/src/scirpy/datasets/__init__.py
@@ -142,7 +142,7 @@ def stephenson2021_5k() -> MuData:
     Both dataset have already passed quality control and all cells that didn't express BCR were discarded.
 
     To  speed up computation time, we solely included 5 samples from each of the COVID-19-positive groups and randomly subsampled down to a total of 5k cells.
-    
+
     """
     fname = cast(PathLike, _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True))
     return mudata.read_h5mu(fname)