Skip to content

Commit

Permalink
add check that merged count dataset is not empty
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivierCoen committed Jan 31, 2025
1 parent 9f51a32 commit 146991d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1,002 deletions.
15 changes: 14 additions & 1 deletion bin/merge_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Written by Olivier Coen. Released under the MIT license.

import argparse
import sys
import polars as pl
from pathlib import Path
import logging
Expand Down Expand Up @@ -98,6 +99,12 @@ def get_counts(files: list[Path]) -> pl.LazyFrame:
lfs = get_valid_lazy_dfs(files)
# joining all count files
merged_lf = reduce(join_dfs, lfs)

# checking if merged count dataframe is empty
if merged_lf.limit(1).collect().is_empty():
logger.error("No data found in any of the input count datasets...")
sys.exit(100)

# casting count columns to Float64
# casting gene id column to String
count_columns = get_count_columns(merged_lf)
Expand All @@ -112,9 +119,15 @@ def get_counts(files: list[Path]) -> pl.LazyFrame:


def filter_out_genes_not_always_present(count_lf: pl.LazyFrame) -> pl.LazyFrame:
    """Keep only the rows whose values are strictly positive in every column.

    All columns except ``ENSEMBL_GENE_ID_COLNAME`` are taken into account
    (presumably one count column per sample; confirm against the caller).

    Args:
        count_lf: lazy frame holding the gene id column and the other columns.

    Returns:
        The filtered lazy frame (still lazy; only a one-row probe is collected
        here).

    Raises:
        SystemExit: with status 101 when no row passes the filter.
    """
    # the row-wise minimum over the non gene id columns is > 0
    # only when each of these columns is > 0
    filtered_count_lf = count_lf.filter(
        pl.concat_list(pl.exclude(ENSEMBL_GENE_ID_COLNAME)).list.min() > 0
    )
    # checking if filtered count dataframe is empty
    # (a single row is requested, which is enough to tell whether anything is left)
    if filtered_count_lf.limit(1).collect().is_empty():
        logger.error("No gene left after filtering for expression > 0 in all samples")
        sys.exit(101)

    return filtered_count_lf


def export_count_data(filtered_count_lf: pl.LazyFrame):
Expand Down
Loading

0 comments on commit 146991d

Please sign in to comment.