Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make row sum normalization optional for pixel-level Pixie preprocessing #1140

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions src/ark/phenotyping/pixie_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
blur_factor=2, subset_proportion=0.1):
rowsum_norm=True, blur_factor=2, subset_proportion=0.1):
"""Preprocess pixel data for one fov

Args:
Expand All @@ -30,6 +30,8 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
Array representing segmentation labels for one fov
pixel_thresh_val (float):
value used to determine per-pixel cutoff for total signal inclusion
rowsum_norm (bool):
Whether to normalize each pixel's expression by its row sum across all channels
blur_factor (int):
The sigma to set for the Gaussian blur
subset_proportion (float):
Expand Down Expand Up @@ -72,7 +74,8 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
pixel_mat = pixel_mat.loc[(pixel_mat[channels] != 0).any(axis=1), :].reset_index(drop=True)

# normalize the row sums of pixel mat (only when rowsum_norm is set)
pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None)
if rowsum_norm:
pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None)

# subset the pixel matrix for training
pixel_mat_subset = pixel_mat.sample(frac=subset_proportion)
Expand All @@ -82,7 +85,7 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,

def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
subset_proportion, pixel_thresh_val, rowsum_norm, seed, channel_norm_df, fov):
"""Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`,
and save the preprocessed data.

Expand Down Expand Up @@ -114,6 +117,8 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
The proportion of pixels to take from each fov
pixel_thresh_val (float):
The value used to determine the per-pixel cutoff for total signal inclusion
rowsum_norm (bool):
Whether to normalize each pixel's expression by its row sum across all channels
seed (int):
The random seed to set for subsetting
channel_norm_df (pandas.DataFrame):
Expand Down Expand Up @@ -164,7 +169,7 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
# create the full and subsetted fov matrices
pixel_mat, pixel_mat_subset = create_fov_pixel_data(
fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
pixel_thresh_val=pixel_thresh_val, rowsum_norm=rowsum_norm, blur_factor=blur_factor,
subset_proportion=subset_proportion
)

Expand Down Expand Up @@ -192,7 +197,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
subset_dir='pixel_mat_subsetted',
norm_vals_name_pre_rownorm='channel_norm_pre_rownorm.feather',
norm_vals_name_post_rownorm='channel_norm_post_rownorm.feather',
pixel_thresh_name='pixel_thresh.feather',
pixel_thresh_name='pixel_thresh.feather', rowsum_norm=True,
channel_percentile_pre_rownorm=0.99, channel_percentile_post_rownorm=0.999,
is_mibitiff=False, blur_factor=2, subset_proportion=0.1, seed=42,
multiprocess=False, batch_size=5):
Expand Down Expand Up @@ -233,6 +238,8 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
The name of the file to store the post-pixel-normalized norm values
pixel_thresh_name (str):
The name of the file to store the pixel threshold value
rowsum_norm (bool):
Whether to normalize each pixel's expression by its row sum across all channels
channel_percentile_pre_rownorm (float):
Percentile used to normalize channels before pixel normalization
channel_percentile_post_rownorm (float):
Expand Down Expand Up @@ -375,7 +382,7 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
fov_data_func = partial(
preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir,
seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_pre_rownorm_df
subset_proportion, pixel_thresh_val, rowsum_norm, seed, channel_norm_pre_rownorm_df
)

# define variable to keep track of number of fovs processed
Expand Down
7 changes: 5 additions & 2 deletions templates/2_Pixie_Cluster_Pixels.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@
"Set the following arguments:\n",
"\n",
"* `channels`: channels to run pixel clustering on\n",
"* `rowsum_norm`: whether to normalize each pixel by the sum of all the channels\n",
"* `blur_factor`: sigma (standard deviation) for the Gaussian blur. Higher values are more aggressive in smoothing signal.\n",
"* `subset_proportion`: the fraction of pixels to take from each FOV for training. Sampling is random."
]
Expand All @@ -335,6 +336,7 @@
"channels = [\"CD3\", \"CD4\", \"CD8\", \"CD14\", \"CD20\", \"CD31\", \"CD45\", \"CD68\",\n",
" \"CD163_nuc_exclude\", \"CK17\", \"Collagen1\", \"Fibronectin\", \n",
" \"ECAD_smoothed\", \"HLADR\", \"SMA\", \"Vim\"]\n",
"rowsum_norm = True\n",
"blur_factor = 2\n",
"subset_proportion = 0.1"
]
Expand All @@ -347,7 +349,7 @@
"\n",
"* Gaussian blur each channel separately\n",
"* Remove empty pixels\n",
"* For the remaining pixels, normalize each pixel by the sum of all the channels\n",
"* If `rowsum_norm` is set, for the remaining pixels, normalize each pixel by the sum of all the channels\n",
"* Subset a `subset_proportion` fraction of non-empty, normalized pixels. This creates the subsetted dataset for training\n",
"\n",
"Note: if you get integer overflow errors loading in your data, try changing the `dtype` argument to a larger type."
Expand Down Expand Up @@ -376,6 +378,7 @@
" data_dir=pixel_data_dir,\n",
" subset_dir=pixel_subset_dir,\n",
" norm_vals_name_post_rownorm=norm_vals_name,\n",
" rowsum_norm=rowsum_norm,\n",
" blur_factor=blur_factor,\n",
" subset_proportion=subset_proportion,\n",
" multiprocess=multiprocess,\n",
Expand Down Expand Up @@ -928,7 +931,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.6"
},
"nbdime-conflicts": {
"local_diff": [
Expand Down
Loading