Skip to content

Commit

Permalink
Added BiocFrame support.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Aaron Lun committed Jul 27, 2023
1 parent 16e9119 commit 64a2d94
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 18 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ install_requires =
importlib-metadata; python_version<"3.8"
mattress
numpy>=1.22.4
BiocFrame

[options.packages.find]
where = src
Expand Down
29 changes: 18 additions & 11 deletions src/scranpy/quality_control/_rna.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Sequence

import numpy as np
from mattress import TatamiNumericPointer, tatamize
from biocframe import BiocFrame

from .._logging import logger
from ..cpphelpers import lib
Expand All @@ -13,7 +12,7 @@


def per_cell_rna_qc_metrics(
x: MatrixTypes, subsets: Sequence = [], num_threads: int = 1, verbose: bool = False
x: MatrixTypes, subsets: dict = {}, num_threads: int = 1, verbose: bool = False
) -> RnaQcResult:
"""Compute qc metrics (RNA).
Expand All @@ -22,16 +21,19 @@ def per_cell_rna_qc_metrics(
Args:
x (MatrixTypes): input matrix.
subsets (Sequence, optional): parameter to specify batches or subsets.
Defaults to [].
subsets (dict, optional): named feature subsets.
Each key is the name of the subset and each value is an array of
integer indices, specifying the rows of `x` belonging to the subset.
Defaults to {}.
num_threads (int, optional): number of threads to use. Defaults to 1.
verbose (bool, optional): display logs?. Defaults to False.
Raises:
TypeError: if x is not an expected matrix type.
Returns:
RnaQcResult: a named tuple with sums, detected and subset proportions.
BiocFrame: data frame containing per-cell count sums, number of detected features
and the proportion of counts in each subset.
"""
if not is_matrix_expected_type(x):
raise TypeError(
Expand All @@ -45,24 +47,25 @@ def per_cell_rna_qc_metrics(
sums = np.ndarray((nc,), dtype=np.float64)
detected = np.ndarray((nc,), dtype=np.int32)

num_subsets = len(subsets)
keys = list(subsets.keys())
num_subsets = len(keys)
subset_in = np.ndarray((num_subsets,), dtype=np.uintp)
subset_out = np.ndarray((num_subsets,), dtype=np.uintp)
collected_in = []
collected_out = []
collected_out = {}

nr = x.nrow()

for i in range(num_subsets):
in_arr = np.ndarray((nr,), dtype=np.uint8)
in_arr.fill(0)
for j in subsets[i]:
for j in subsets[keys[i]]:
in_arr[j] = 1
collected_in.append(in_arr)
subset_in[i] = in_arr.ctypes.data

out_arr = np.ndarray((nc,), dtype=np.float64)
collected_out.append(out_arr)
collected_out[keys[i]] = out_arr
subset_out[i] = out_arr.ctypes.data

if verbose is True:
Expand All @@ -79,4 +82,8 @@ def per_cell_rna_qc_metrics(
num_threads,
)

return RnaQcResult(sums, detected, collected_out)
return BiocFrame({
"sums": sums,
"detected": detected,
"subset_proportions": BiocFrame(collected_out, numberOfRows = nc)
})
22 changes: 15 additions & 7 deletions tests/test_quality_control_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,25 @@

def test_quality_control_numpy():
x = np.random.rand(1000, 100)
result = per_cell_rna_qc_metrics(x, subsets=[[1, 10, 100]])
result = per_cell_rna_qc_metrics(x, subsets={ "foo": [1, 10, 100] })

assert result is not None
assert result.dims[0] == 100
assert isinstance(result, scr.types.RnaQcResult)
assert result.sums is not None
assert result.detected is not None
assert result.subset_proportions is not None
assert result.column("sums") is not None
assert result.column("detected") is not None
assert result.column("subset_proportions") is not None
assert result.column("subset_proportions").column("foo") is not None

# Works without any subsets.
result0 = per_cell_rna_qc_metrics(x)
assert result.column("sums") is not None
assert result.column("detected") is not None
assert result.column("subset_proportions").shape[1] == 0

# Same results when running in parallel.
resultp = per_cell_rna_qc_metrics(x, subsets=[[1, 10, 100]], num_threads = 3)
assert np.array_equal(result["sums"], resultp["sums"])
assert np.array_equal(result["detected"], resultp["detected"])
assert np.array_equal(result["subset_proportions"][0], resultp["subset_proportions"][0])
assert np.array_equal(result.column("sums"), resultp.column("sums"))
assert np.array_equal(result.column("detected"), resultp.column("detected"))
assert np.array_equal(result.column("subset_proportions").column(0), resultp.column("subset_proportions").column(0))

0 comments on commit 64a2d94

Please sign in to comment.