Skip to content

Commit

Permalink
update bulk count
Browse files Browse the repository at this point in the history
  • Loading branch information
pigraul committed Dec 23, 2024
1 parent e7f314b commit 3210a99
Showing 1 changed file with 26 additions and 20 deletions.
46 changes: 26 additions & 20 deletions celescope/bulk_rna/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
random.seed(0)
np.random.seed(0)


class Count(super_count.Count):
"""
## Features
Expand Down Expand Up @@ -42,7 +43,6 @@ def __init__(self, args, display_title=None):
self.gene_cutoff = args.gene_cutoff
self._table_id = self.assay


@utils.add_log
def run(self):
## output expression matrix
Expand All @@ -65,7 +65,9 @@ def run(self):
df_bc_valid.to_csv(self.marked_count_file, sep="\t")

## saturation
df_valid = df.reset_index().set_index("Barcode").loc[df_bc_valid.index.to_list()]
df_valid = (
df.reset_index().set_index("Barcode").loc[df_bc_valid.index.to_list()]
)
self.saturation = Count.get_read_saturation(df_valid)
self.downsample_dict = self.downsample(df_valid.reset_index())

Expand All @@ -79,40 +81,37 @@ def get_read_saturation(df_cell):
reads = sum(df_cell["read"])
saturation = 1 - unique / reads
return saturation

@staticmethod
def sub_sample(fraction, df_cell, cell_read_index):
cell_read = df_cell['read'].sum()
cell_read = df_cell["read"].sum()
frac_n_read = int(cell_read * fraction)
subsample_read_index = cell_read_index[:frac_n_read]
index_dedup = np.unique(subsample_read_index, return_counts=False)
# gene median
df_cell_subsample = df_cell.loc[index_dedup, ]
geneNum_median = float(df_cell_subsample.groupby(
'Barcode').agg({ROW: 'nunique'}).median().iloc[0])
df_cell_subsample = df_cell.loc[index_dedup,]
geneNum_median = float(
df_cell_subsample.groupby("Barcode").agg({ROW: "nunique"}).median().iloc[0]
)
return geneNum_median

@utils.add_log
def downsample(self, df_cell):
"""saturation and median gene
return fraction=1 saturation
"""
cell_read_index = np.array(df_cell.index.repeat(df_cell['read']), dtype='int32')
cell_read_index = np.array(df_cell.index.repeat(df_cell["read"]), dtype="int32")
np.random.shuffle(cell_read_index)

format_str = "%.2f\t%.2f\n"
res_dict = {
"fraction": [0.0],
"median_gene": [0]
}
with open(self.downsample_file, 'w') as fh:
fh.write('percent\tmedian_geneNum\n')
res_dict = {"fraction": [0.0], "median_gene": [0]}
with open(self.downsample_file, "w") as fh:
fh.write("percent\tmedian_geneNum\n")
fh.write(format_str % (0, 0))
for fraction in np.arange(0.1, 1.1, 0.1):
geneNum_median = Count.sub_sample(
fraction, df_cell, cell_read_index)
geneNum_median = Count.sub_sample(fraction, df_cell, cell_read_index)
fh.write(format_str % (fraction, geneNum_median))
#def format_float(x): return round(x / 100, 4)
# def format_float(x): return round(x / 100, 4)
res_dict["fraction"].append(round(fraction, 1))
res_dict["median_gene"].append(geneNum_median)

Expand Down Expand Up @@ -143,7 +142,7 @@ def add_count_metrics(self, df):
help_info="",
)
self.add_metric(
name=f"Sequencing saturation",
name="Sequencing saturation",
value=self.saturation,
value_type="fraction",
help_info="the fraction of read originating from an already-observed UMI.",
Expand All @@ -157,8 +156,15 @@ def add_count_metrics(self, df):

def add_plot_data(self):
df_plot = pd.DataFrame(self.downsample_dict)
df_plot = df_plot.rename(columns={"median_gene":"Median Genes","fraction":"Reads Fraction"})
self.add_data(df_line=Line_plot(df_plot, x_title="Reads Fraction", y_title="Median Genes").get_plotly_div())
df_plot = df_plot.rename(
columns={"median_gene": "Median Genes", "fraction": "Reads Fraction"}
)
self.add_data(
df_line=Line_plot(
df_plot, x_title="Reads Fraction", y_title="Median Genes"
).get_plotly_div()
)


def count(args):
with Count(args, display_title="Wells") as runner:
Expand Down

0 comments on commit 3210a99

Please sign in to comment.