Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assorted fixes to the initial QC method. #3

Merged
merged 7 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ install_requires =
importlib-metadata; python_version<"3.8"
mattress
numpy>=1.22.4
BiocFrame

[options.packages.find]
where = src
Expand Down
2 changes: 0 additions & 2 deletions src/scranpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@
__version__ = "unknown"
finally:
del version, PackageNotFoundError

__all__ = ["qc"]
6 changes: 3 additions & 3 deletions src/scranpy/lib/per_cell_rna_qc_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

extern "C" {

void per_cell_rna_qc_metrics(const Mattress* mat, int num_subsets, const uint64_t* subset_ptrs, double* sum_output, int32_t* detected_output, const uint64_t* subset_output, int num_threads) {
void per_cell_rna_qc_metrics(const Mattress* mat, int num_subsets, const uintptr_t* subset_ptrs, double* sum_output, int32_t* detected_output, uintptr_t* subset_output, int num_threads) {
scran::PerCellRnaQcMetrics runner;
runner.set_num_threads(num_threads);

std::vector<const unsigned char*> subsets(num_subsets);
std::vector<const uint8_t*> subsets(num_subsets);
for (int i = 0; i < num_subsets; ++i) {
subsets[i] = reinterpret_cast<const unsigned char*>(subset_ptrs[i]);
subsets[i] = reinterpret_cast<const uint8_t*>(subset_ptrs[i]);
}

scran::PerCellRnaQcMetrics::Buffers<double, int32_t> buffer;
Expand Down
File renamed without changes.
43 changes: 25 additions & 18 deletions src/scranpy/qc/_rna.py → src/scranpy/quality_control/_rna.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,39 @@
from typing import Sequence

import numpy as np
from biocframe import BiocFrame
from mattress import TatamiNumericPointer, tatamize

from .._logging import logger
from ..cpphelpers import lib
from ..types import MatrixTypes, RnaQcResult, is_matrix_expected_type
from ..types import MatrixTypes, is_matrix_expected_type

__author__ = "ltla, jkanche"
__copyright__ = "ltla, jkanche"
__license__ = "MIT"


def per_cell_rna_qc_metrics(
x: MatrixTypes, subsets: Sequence = [], num_threads: int = 1, verbose: bool = False
) -> RnaQcResult:
x: MatrixTypes, subsets: dict = {}, num_threads: int = 1, verbose: bool = False
) -> BiocFrame:
"""Compute qc metrics (RNA).

This function expects the matrix (`x`) to be features (rows) by cells (columns) and
not the other way around!

Args:
x (MatrixTypes): input matrix.
subsets (Sequence, optional): parameter to specify batches or subsets.
Defaults to [].
subsets (dict, optional): named feature subsets.
Each key is the name of the subset and each value is an array of
integer indices, specifying the rows of `x` belonging to the subset.
Defaults to {}.
num_threads (int, optional): number of threads to use. Defaults to 1.
verbose (bool, optional): display logs?. Defaults to False.

Raises:
TypeError: if x is not an expected matrix type.

Returns:
RnaQcResult: a named tuple with sums, detected and subset proportions.
BiocFrame: data frame containing per-cell count sums, number of detected features
and the proportion of counts in each subset.
"""
if not is_matrix_expected_type(x):
raise TypeError(
Expand All @@ -45,24 +47,23 @@ def per_cell_rna_qc_metrics(
sums = np.ndarray((nc,), dtype=np.float64)
detected = np.ndarray((nc,), dtype=np.int32)

num_subsets = len(subsets)
subset_in = np.ndarray((num_subsets,), dtype=np.uint64)
subset_out = np.ndarray((num_subsets,), dtype=np.uint64)
keys = list(subsets.keys())
num_subsets = len(keys)
subset_in = np.ndarray((num_subsets,), dtype=np.uintp)
subset_out = np.ndarray((num_subsets,), dtype=np.uintp)
collected_in = []
collected_out = []
collected_out = {}

nr = x.nrow()

for i in range(num_subsets):
in_arr = np.ndarray((nr,), dtype=np.uint8)
in_arr.fill(0)
for j in subsets[i]:
in_arr[j] = 1
in_arr = np.zeros((nr,), dtype=np.uint8)
in_arr[subsets[keys[i]]] = 1
collected_in.append(in_arr)
subset_in[i] = in_arr.ctypes.data

out_arr = np.ndarray((nc,), dtype=np.float64)
collected_out.append(out_arr)
collected_out[keys[i]] = out_arr
subset_out[i] = out_arr.ctypes.data

if verbose is True:
Expand All @@ -79,4 +80,10 @@ def per_cell_rna_qc_metrics(
num_threads,
)

return RnaQcResult(sums, detected, collected_out)
return BiocFrame(
{
"sums": sums,
"detected": detected,
"subset_proportions": BiocFrame(collected_out, numberOfRows=nc),
}
)
10 changes: 8 additions & 2 deletions src/scranpy/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from collections import namedtuple
from typing import Any, Union

import numpy as np
Expand All @@ -10,10 +9,17 @@
__license__ = "MIT"

MatrixTypes = Union[TatamiNumericPointer, np.ndarray, sp.spmatrix]
RnaQcResult = namedtuple("RnaQcResult", ["sums", "detected", "subset_proportions"])


def is_matrix_expected_type(x: Any) -> bool:
"""Checks if `x` is an expect matrix type.

Args:
x (Any): any object.

Returns:
bool: True if `x` is supported.
"""
return (
isinstance(x, TatamiNumericPointer)
or isinstance(x, np.ndarray)
Expand Down
18 changes: 0 additions & 18 deletions tests/test_qc_steps.py

This file was deleted.

33 changes: 33 additions & 0 deletions tests/test_quality_control_steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import numpy as np
from scranpy.quality_control import per_cell_rna_qc_metrics

__author__ = "ltla, jkanche"
__copyright__ = "ltla, jkanche"
__license__ = "MIT"


def test_quality_control_numpy():
    """Exercise `per_cell_rna_qc_metrics` on a dense NumPy matrix.

    Three situations are covered: a call with one named subset, a call with
    no subsets at all, and a multi-threaded call whose output is compared
    against the single-threaded one.
    """
    x = np.random.rand(1000, 100)
    subset_rows = [1, 10, 100]

    # Single named subset: the frame has one row per column of `x` and all
    # of the expected columns are retrievable.
    single = per_cell_rna_qc_metrics(x, subsets={"foo": subset_rows})
    assert single is not None
    assert single.dims[0] == 100
    for column_name in ("sums", "detected", "subset_proportions"):
        assert single.column(column_name) is not None
    assert single.column("subset_proportions").column("foo") is not None

    # Works without any subsets.
    no_subsets = per_cell_rna_qc_metrics(x)
    for column_name in ("sums", "detected"):
        assert no_subsets.column(column_name) is not None
    assert no_subsets.column("subset_proportions").shape[1] == 0

    # Same results when running in parallel. The subset is named differently
    # here, so its proportions are compared by position rather than by name.
    threaded = per_cell_rna_qc_metrics(
        x, subsets={"BAR": subset_rows}, num_threads=3
    )
    assert np.array_equal(single.column("sums"), threaded.column("sums"))
    assert np.array_equal(single.column("detected"), threaded.column("detected"))

    single_proportions = single.column("subset_proportions").column(0)
    threaded_proportions = threaded.column("subset_proportions").column(0)
    assert np.array_equal(single_proportions, threaded_proportions)
Loading