-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathPipeline_benchmarking_script.R
101 lines (87 loc) · 3.39 KB
/
Pipeline_benchmarking_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Benchmark setup ----

# lobstr is only referenced by the commented-out obj_size() memory probes
# further down. Install it on demand rather than re-installing (and hitting
# the network) every time the script is run.
if (!requireNamespace("lobstr", quietly = TRUE)) {
  install.packages("lobstr")
}
library(lobstr)

# Defining resources
# Benchmark grid: each value of `Cores` is combined with each value of
# `Sample_size`.
# A larger grid, Sample_size <- c(10, 20, 50, 100), used to be assigned here
# and was immediately overwritten by the values below. It is kept only as this
# note so the effective configuration is unambiguous.
Cores <- c(3, 5, 10, 20, 50, 100, 200)
Sample_size <- c(1, 2, 3, 5)

# list.files() below returns names relative to the working directory, so the
# directory holding the input files has to be set first.
setwd("/vast/scratch/users/si.j/susan_fibrosis")
#initial_file_count <- 2
files <- list.files()

# Path of the targets data store used by the benchmark runs.
store <- "/stornext/General/scratch/GP_Transfer/si.j/store_pipeline_benchmark_fibrosis_all_data_3"
# for (i in initial_file_count:length(files)) {
#
# tar_invalidate(names = everything(), store = store)
#
# # Memory usage before pipeline execution
# if(exists("preprocessed_seurat", envir = globalenv())) {
# mem_before <- obj_size(get("preprocessed_seurat", envir = globalenv()))
# } else {
# mem_before <- 0
# }
# Benchmark loop ----
# Runs the pipeline once for every (core, sample_size) combination and prints
# the user / system / elapsed time of each run.
# NOTE(review): tar_invalidate(), crew_controller_slurm() and
# run_targets_pipeline() are not loaded anywhere in this script; presumably
# targets, crew.cluster and HPCell are attached beforehand - confirm.

# Upper bound on the number of SLURM workers requested for a single run.
max_workers <- 100

for (core in Cores) {
  for (sample_size in Sample_size) {
    # Skip sizes that exceed the available files. `next` (rather than `break`)
    # keeps this correct even if `Sample_size` is not sorted ascending.
    if (length(files) < sample_size) {
      next
    }

    #setwd("~/HPCell")
    # Invalidate every target in the store so the pipeline reruns them instead
    # of reusing results left over from the previous iteration.
    tar_invalidate(names = everything(), store = store)

    # Select the subset of files to process in this iteration
    #file_subset <- files[1:i]
    file_subset <- files[seq_len(sample_size)]

    # Workers scale with both loop variables and are capped at `max_workers`.
    # The previous cap was length(file_subset), which equals sample_size and is
    # never larger than core * sample_size, so the worker count ignored `core`
    # and every pass of the outer loop benchmarked the same configuration.
    # NOTE(review): capping at max_workers follows the commented-out formula
    # min(number_of_samples * workers_per_sample, max_workers) - confirm this
    # is the intended scaling.
    total_workers <- min(core * sample_size, max_workers)

    cat("Running with", core, "cores for", sample_size, "samples, using", total_workers, "workers\n")

    # Initialize computing resources for all files
    computing_resources <- crew_controller_slurm(
      name = "my_controller",
      slurm_memory_gigabytes_per_cpu = 20,
      slurm_cpus_per_task = 1,
      workers = total_workers,
      verbose = FALSE,
      seconds_idle = 30
    )

    # Time and run your pipeline function
    time_taken <- system.time({
      preprocessed_seurat <- run_targets_pipeline(
        input_data = file_subset,
        tissue = "pbmc",
        computing_resources = computing_resources,
        sample_column = "sampleName",
        store = store,
        input_reference = NULL,
        cell_type_annotation_column = "cellAnno"
      )
    })

    # Memory usage after pipeline execution
    #mem_after <- obj_size(get("preprocessed_seurat", envir = globalenv()))
    #mem_used_this_run <- mem_after - mem_before

    # Output the time used for this run
    cat("Sample size:", length(file_subset), "\n",
        "Time taken: User time =", time_taken["user.self"],
        "System time =", time_taken["sys.self"],
        "Elapsed time =", time_taken["elapsed"], "seconds\n")
    #"Memory used:", format(mem_used_this_run, units = "Mb"), "\n\n")
  }
}
# Plotting ----
library(ggplot2)

# Line colour assigned to each timing metric in the chart.
metric_palette <- c(
  "UserTime" = "cornflowerblue",
  "SystemTime" = "slategrey",
  "ElapsedTime" = "coral"
)

# Hard-coded timing values, one row per sample size (2 to 11); the chart
# below labels them as seconds.
data <- data.frame(
  SampleSize = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
  UserTime = c(
    23.944, 26.71, 36.797, 43.466, 46.846,
    54.226, 59.966, 69.946, 76.142, 86.875
  ),
  SystemTime = c(
    5.856, 6.216, 8.143, 9.805, 11.088,
    13.302, 15.419, 18.439, 21.315, 23.56
  ),
  ElapsedTime = c(
    278.592, 280.922, 310.607, 306.815, 312.007,
    333.49, 333.517, 384.848, 380.266, 396.131
  )
)

# Reshape to long format: one row per (SampleSize, metric) pair, with the
# metric name in `variable` and its timing in `value`.
data_melted <- reshape2::melt(data, id.vars = "SampleSize")

# One coloured line (with point markers) per metric across sample sizes.
ggplot(
  data = data_melted,
  mapping = aes(x = SampleSize, y = value, colour = variable)
) +
  geom_line() +
  geom_point() +
  scale_colour_manual(values = metric_palette) +
  labs(
    title = "Performance Metrics by Sample Size",
    x = "Sample Size",
    y = "Time (seconds)",
    color = "Metric"
  ) +
  theme_minimal()