diff --git a/.gitignore b/.gitignore
index 5e501cbe..406d336a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -215,3 +215,7 @@ workflow/data/arbigent/scTRIP_segmentation.bed
 .tests/data_CHR17/RPE-BM510/bam/*.bam.sort
 .tests/external_data/chr17.fa.log
 LOGS_DEV/
+
+# scTRIP multiplot
+workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot
+workflow/config/scTRIP_multiplot.ok
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 065f372f..c8e55af5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,13 +9,13 @@ repos:
       # id: no-commit-to-branch
     repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v3.4.0
-  - hooks:
-      - id: snakefmt
-    repo: https://github.com/snakemake/snakefmt
-    rev: 0.4.0
-  - hooks:
-      - id: commitizen
-        stages:
-          - commit-msg
-    repo: https://github.com/commitizen-tools/commitizen
-    rev: v2.17.12
+  # - hooks:
+  #     - id: snakefmt
+  #   repo: https://github.com/snakemake/snakefmt
+  #   rev: 0.4.0
+  # - hooks:
+  #     - id: commitizen
+  #       stages:
+  #         - commit-msg
+  #   repo: https://github.com/commitizen-tools/commitizen
+  #   rev: v2.17.12
diff --git a/README.md b/README.md
index c72e1f12..5119ff39 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ the workflow goes through the following steps:
 | ![summary](docs/images/figure_pipeline.png) |
 | :--------------------------------------------------------------------------------: |
-| _MosaiCatcher snakemake pipeline_ |
+| _MosaiCatcher snakemake pipeline and visualisation examples_ |
 | _[ashleys-qc-pipeline](https://github.com/friendsofstrandseq/ashleys-qc-pipeline)_ |

# 📘 Documentation

@@ -49,6 +49,7 @@ the workflow goes through the following steps:
 - [x] Version synchronisation between ashleys-qc-pipeline and mosaicatcher-pipeline ([1.8.3](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.3))
 - [x] Report captions update ([1.8.5](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.5))
 - [x] Clustering plot (heatmap) & SV calls plot update ([1.8.6](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.6))
+- [x] [`ashleys_pipeline_only` parameter](/docs/usage.md#usage): when using mosaicatcher-pipeline, triggers ashleys-qc-pipeline only and stops after the generation of the counts, ashleys predictions & plots, allowing the user to manually review/select the cells to be processed ([2.2.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.2.0))
 - [ ] Plotting options (enable/disable segmentation back colors)

## Bioinformatic-related features

@@ -59,15 +60,20 @@ the workflow goes through the following steps:
 - [x] Ploidy detection at the segment and the chromosome level: used to bypass StrandPhaseR if more than half of a chromosome is haploid ([1.7.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.7.0))
 - [x] inpub_bam_legacy mode (bam/selected folders) ([1.8.4](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.4))
 - [x] Blacklist regions files for T2T & hg19 ([1.8.5](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.5))
-- [x] ArbiGent integration: Strand-Seq based genotyper to study SV containly at least 500bp of uniquely mappable sequence ([1.9.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.9.0))
-- [x] scNOVA integration: Strand-Seq Single-Cell Nucleosome Occupancy and genetic Variation Analysis 
([1.9.2](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.9.2))
+- [x] [ArbiGent](/docs/usage.md#arbigent-mode-of-execution) integration: Strand-Seq based genotyper to study SV containing at least 500bp of uniquely mappable sequence ([1.9.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.9.0))
+- [x] [scNOVA](/docs/usage.md#scnova-mode-of-execution) integration: Strand-Seq Single-Cell Nucleosome Occupancy and genetic Variation Analysis ([1.9.2](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.9.2))
+- [x] [`multistep_normalisation` and `multistep_normalisation_for_SV_calling` parameters](/docs/usage.md#multistep-normalisation) to replace GC analysis module (library size normalisation, GC correction, Variance Stabilising Transformation) ([2.1.1](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.1.1))
+- [x] Strand-Seq processing based on mm10 assembly ([2.1.2](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.1.2))
+- [x] UCSC ready-to-use file generation including counts & SV calls ([2.1.2](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.1.2))
+- [x] `blacklist_regions` parameter ([2.2.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.2.0))
+- [x] IGV ready-to-use XML session generation ([2.2.2](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.2.2))
 - [ ] Pooled samples

## Small issues to fix

-- [ ] Move pysam / SM tag comparison script to snakemake rule
 - [x] replace `input_bam_location` by `data_location` (harmonization with [ashleys-qc-pipeline](https://github.com/friendsofstrandseq/ashleys-qc-pipeline.git))
 - [x] List of commands available through list_commands parameter ([1.8.6](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/1.8.6)
+- [x] Move pysam / SM tag comparison script to snakemake rule ([2.2.0](https://github.com/friendsofstrandseq/mosaicatcher-pipeline/releases/tag/2.2.0))

# 🛑 Troubleshooting & Current limitations

@@ -103,3 +109,5 @@ the workflow goes through the following steps:
 > ArbiGent publication: Porubsky, David, Wolfram Höps, Hufsah Ashraf, PingHsun Hsieh, Bernardo Rodriguez-Martin, Feyza Yilmaz, Jana Ebler, et al. 2022. “Recurrent Inversion Polymorphisms in Humans Associate with Genetic Instability and Genomic Disorders.” Cell 185 (11): 1986-2005.e26. https://doi.org/10.1016/j.cell.2022.04.017.

 > scNOVA publication: Jeong, Hyobin, Karen Grimes, Kerstin K. Rauwolf, Peter-Martin Bruch, Tobias Rausch, Patrick Hasenfeld, Eva Benito, et al. 2022. “Functional Analysis of Structural Variants in Single Cells Using Strand-Seq.” Nature Biotechnology, November, 1–13. https://doi.org/10.1038/s41587-022-01551-4.
diff --git a/afac/alice_binning.ipynb b/afac/alice_binning.ipynb new file mode 100644 index 00000000..b0eae79d --- /dev/null +++ b/afac/alice_binning.ipynb @@ -0,0 +1,714 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chrstartendvaluestrand
0chr10488874902242.01
1chr10492774933731.81
2chr10642176453247.51
3chr10703377047245.21
4chr10793977950242.11
..................
217741chrY258059422580598763.61
217742chrY264083452640848054.21
217743chrY266531812665321135.11
217744chrY568771735687729345.11
217745chrY568828035688292336.11
\n", + "

217746 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " chr start end value strand\n", + "0 chr10 48887 49022 42.0 1\n", + "1 chr10 49277 49337 31.8 1\n", + "2 chr10 64217 64532 47.5 1\n", + "3 chr10 70337 70472 45.2 1\n", + "4 chr10 79397 79502 42.1 1\n", + "... ... ... ... ... ...\n", + "217741 chrY 25805942 25805987 63.6 1\n", + "217742 chrY 26408345 26408480 54.2 1\n", + "217743 chrY 26653181 26653211 35.1 1\n", + "217744 chrY 56877173 56877293 45.1 1\n", + "217745 chrY 56882803 56882923 36.1 1\n", + "\n", + "[217746 rows x 5 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"/scratch/tweber/TMP/ALICE/G4.csv\", sep=\",\")\n", + "df = df.drop(df.columns[0], axis=1)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0chromstartendsamplecellwctot_countclasstot_count_G4_quadruplexG4_norm
00chr10200000LanexATRXKOMNILanexATRXKOMNIPE203012.7016942.7016945.403388WC19.00.095
11chr1200000400000LanexATRXKOMNILanexATRXKOMNIPE203019.9197329.69428319.614015WC8.00.040
22chr1400000600000LanexATRXKOMNILanexATRXKOMNIPE203018.15782911.78353119.941360WC24.00.120
33chr1600000800000LanexATRXKOMNILanexATRXKOMNIPE2030111.5806188.83267520.413292WC39.00.195
44chr18000001000000LanexATRXKOMNILanexATRXKOMNIPE203017.4057689.66864217.074410WC169.00.845
.......................................
11853671185367chrY6160000061800000LanexATRXKOMNILanexATRXKOMNIPE203950.0000000.0000000.000000WCNaNNaN
11853681185368chrY6180000062000000LanexATRXKOMNILanexATRXKOMNIPE203950.0000000.0000000.000000WCNaNNaN
11853691185369chrY6200000062200000LanexATRXKOMNILanexATRXKOMNIPE203953.3035560.0000003.303556WCNaNNaN
11853701185370chrY6220000062400000LanexATRXKOMNILanexATRXKOMNIPE203958.3388900.0000008.338890WCNaNNaN
11853711185371chrY6240000062460029LanexATRXKOMNILanexATRXKOMNIPE203950.0000000.0000000.000000WCNaNNaN
\n", + "

1185372 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 chrom start end sample \\\n", + "0 0 chr1 0 200000 LanexATRXKOMNI \n", + "1 1 chr1 200000 400000 LanexATRXKOMNI \n", + "2 2 chr1 400000 600000 LanexATRXKOMNI \n", + "3 3 chr1 600000 800000 LanexATRXKOMNI \n", + "4 4 chr1 800000 1000000 LanexATRXKOMNI \n", + "... ... ... ... ... ... \n", + "1185367 1185367 chrY 61600000 61800000 LanexATRXKOMNI \n", + "1185368 1185368 chrY 61800000 62000000 LanexATRXKOMNI \n", + "1185369 1185369 chrY 62000000 62200000 LanexATRXKOMNI \n", + "1185370 1185370 chrY 62200000 62400000 LanexATRXKOMNI \n", + "1185371 1185371 chrY 62400000 62460029 LanexATRXKOMNI \n", + "\n", + " cell w c tot_count class \\\n", + "0 LanexATRXKOMNIPE20301 2.701694 2.701694 5.403388 WC \n", + "1 LanexATRXKOMNIPE20301 9.919732 9.694283 19.614015 WC \n", + "2 LanexATRXKOMNIPE20301 8.157829 11.783531 19.941360 WC \n", + "3 LanexATRXKOMNIPE20301 11.580618 8.832675 20.413292 WC \n", + "4 LanexATRXKOMNIPE20301 7.405768 9.668642 17.074410 WC \n", + "... ... ... ... ... ... \n", + "1185367 LanexATRXKOMNIPE20395 0.000000 0.000000 0.000000 WC \n", + "1185368 LanexATRXKOMNIPE20395 0.000000 0.000000 0.000000 WC \n", + "1185369 LanexATRXKOMNIPE20395 3.303556 0.000000 3.303556 WC \n", + "1185370 LanexATRXKOMNIPE20395 8.338890 0.000000 8.338890 WC \n", + "1185371 LanexATRXKOMNIPE20395 0.000000 0.000000 0.000000 WC \n", + "\n", + " tot_count_G4_quadruplex G4_norm \n", + "0 19.0 0.095 \n", + "1 8.0 0.040 \n", + "2 24.0 0.120 \n", + "3 39.0 0.195 \n", + "4 169.0 0.845 \n", + "... ... ... \n", + "1185367 NaN NaN \n", + "1185368 NaN NaN \n", + "1185369 NaN NaN \n", + "1185370 NaN NaN \n", + "1185371 NaN NaN \n", + "\n", + "[1185372 rows x 12 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts_file = pd.read_csv(\"/scratch/tweber/TMP/ALICE/c_G4.csv\")\n", + "counts_file" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "chrom\n", + "chr1 248387328\n", + "chr10 134758134\n", + "chr11 135127769\n", + "chr12 133324548\n", + "chr13 113566686\n", + "chr14 101161492\n", + "chr15 99753195\n", + "chr16 96330374\n", + "chr17 84276897\n", + "chr18 80542538\n", + "chr19 61707364\n", + "chr2 242696752\n", + "chr20 66210255\n", + "chr21 45090682\n", + "chr22 51324926\n", + "chr3 201105948\n", + "chr4 193574945\n", + "chr5 182045439\n", + "chr6 172126628\n", + "chr7 160567428\n", + "chr8 146259331\n", + "chr9 150617247\n", + "chrX 154259566\n", + "chrY 62460029\n", + "Name: end, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts_file.groupby(\"chrom\")[\"end\"].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_48491/620205983.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " bin_df[\"value\"] = 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromstartendvalue
0chr102000000
1chr12000004000000
2chr14000006000000
3chr16000008000000
4chr180000010000000
...............
1185367chrY61600000618000000
1185368chrY61800000620000000
1185369chrY62000000622000000
1185370chrY62200000624000000
1185371chrY62400000624600290
\n", + "

1185372 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " chrom start end value\n", + "0 chr1 0 200000 0\n", + "1 chr1 200000 400000 0\n", + "2 chr1 400000 600000 0\n", + "3 chr1 600000 800000 0\n", + "4 chr1 800000 1000000 0\n", + "... ... ... ... ...\n", + "1185367 chrY 61600000 61800000 0\n", + "1185368 chrY 61800000 62000000 0\n", + "1185369 chrY 62000000 62200000 0\n", + "1185370 chrY 62200000 62400000 0\n", + "1185371 chrY 62400000 62460029 0\n", + "\n", + "[1185372 rows x 4 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bin_df = counts_file[[\"chrom\", \"start\", \"end\"]]\n", + "bin_df[\"value\"] = 0\n", + "bin_df" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_48491/1221284625.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_lite[\"bin\"] = pd.cut(x=df_lite[\"start\"].values, bins=[0]+bin_df[\"end\"].unique().tolist())\n" + ] + }, + { + "data": { + "text/plain": [ + "bin\n", + "(0, 200000] 0.0\n", + "(200000, 400000] 0.0\n", + "(400000, 600000] 0.0\n", + "(600000, 800000] 0.0\n", + "(800000, 1000000] 0.0\n", + " ... \n", + "(44200000, 44400000] 1548.4\n", + "(44400000, 44600000] 1673.8\n", + "(44600000, 44800000] 2139.1\n", + "(44800000, 45000000] 2336.0\n", + "(45000000, 45090682] 666.2\n", + "Name: value, Length: 226, dtype: float64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bin_df = bin_df.loc[bin_df[\"chrom\"] == \"chr21\"]\n", + "bin_df\n", + "\n", + "df_lite = df.loc[df[\"chr\"] == \"chr21\"]\n", + "df_lite[\"bin\"] = pd.cut(x=df_lite[\"start\"].values, bins=[0]+bin_df[\"end\"].unique().tolist())\n", + "\n", + "df_lite.groupby(\"bin\")[\"value\"].sum()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_lite" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "notebook", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/config/config.yaml b/config/config.yaml index 78dc3d51..5a3b5098 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,10 +3,10 @@ # -------------------------------------------------------- # MosaiCatcher version -version: 2.2.1 +version: 2.2.2 # Ashleys-QC pipeline version -ashleys_pipeline_version: 2.2.1 +ashleys_pipeline_version: 2.2.2 # Email for notifications about the pipeline's status email: "" @@ -73,6 +73,7 @@ references_data: R_reference: "BSgenome.Hsapiens.UCSC.hg19" segdups: "workflow/data/segdups/segDups_hg19_UCSCtrack.bed.gz" snv_sites_to_genotype: "" + # snv_sites_to_genotype: "/g/korbel2/weber/MosaiCatcher_files/snv_sites_to_genotype/ALL.chr1-22plusXY_hg19_sites.20170504.renamedCHR.vcf.gz" reference_file_location: 
https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.fa.gz
   "T2T":
     reference_fasta: "workflow/data/ref_genomes/T2T.fa"
@@ -181,6 +182,11 @@ additional_sce_cutoff: 20000000
 # SCE min distance
 sce_min_distance: 500000

+# --------------------------------------------------------
+# Downstream modules configuration - genome browsing
+# --------------------------------------------------------
+genome_browsing_files_generation: False
+
 # --------------------------------------------------------
 # Downstream modules configuration - ArbiGent
 # --------------------------------------------------------
@@ -235,6 +241,14 @@ plottype_clustering:
   - "position"
   - "chromosome"

+scTRIP_multiplot: False
+
+# --------------------------------------------------------
+# StrandScape
+# --------------------------------------------------------
+
+use_strandscape_labels: False
+
 # --------------------------------------------------------
 # Internal Parameters
 # --------------------------------------------------------
diff --git a/config/config_metadata.yaml b/config/config_metadata.yaml
index 054f84a5..97c1af02 100644
--- a/config/config_metadata.yaml
+++ b/config/config_metadata.yaml
@@ -111,9 +111,27 @@ scNOVA:
   required: False
   default: False
   lint_check: False
+genome_browsing_files_generation:
+  desc: Enable genome browsing files generation (UCSC + IGV)
+  type: bool
+  required: False
+  default: False
+  lint_check: False
 genecore_prefix:
   desc: ""
   type: str
   required: False
   default: "/g/korbel/shared/genecore"
   lint_check: False
+scTRIP_multiplot:
+  desc: "Enable scTRIP multiplot (W/C, depth, phased het SNPs, SV) for all chrom of all cells for every sample"
+  type: bool
+  required: False
+  default: False
+  lint_check: False
+use_strandscape_labels:
+  desc: "Use StrandScape labels instead of cell_selection/labels.tsv"
+  type: bool
+  required: False
+  default: False
+  lint_check: False
diff --git a/docs/output.md b/docs/output.md
index 98d1f003..9a1ef826 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -28,7 +28,7 @@ as the distribution of total number of reads per cell, duplication rate, or excl
 Afterwards, every pages show the overview of binning count result of each of the single-cells as presented below. The depth of Crick reads are depicted in the green color in the right side, and the depth of Watson reads are depicted in the orange color in the left side of each chromosome lines. HMM automatically defines the WW/WC/CC status according the reads distribution (yellow background: WC, green background: CC, orange background: WW).

-| ![summary](images/plots/414.png) |
+| ![summary](images/plots/correct.png) |
 | :----------------------------------: |
 | _Strand-seq karyotype visualisation_ |

@@ -128,6 +128,32 @@ By using these heatmaps, the user can easily identify subclones based on the SV
 | :----------------------------------------: |
 | _2. SV clustering (SV type)_ |

+### Genome browsing
+
+File path: `//plots/UCSC|IGV`
+
+You can now also generate ready-to-use UCSC and IGV genome-browsing files when `genome_browsing_files_generation=True`.
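+
+A minimal sketch of enabling this from the command line (core count and data path are illustrative, adapt them to your setup):
+
+```bash
+snakemake \
+    --cores 12 \
+    --config \
+        data_location=<DATA_FOLDER> \
+        genome_browsing_files_generation=True
+```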
Here is an example below:
+
+| ![summary](images/plots/genome_browsing.png) |
+| :--------------------------------------------------------------------------: |
+| _(A) UCSC and (B) IGV Genome browsing using files generated by MosaiCatcher_ |
+
+#### UCSC genome browser
+
+To visualise your data on the UCSC genome browser, go to the [website](https://genome.ucsc.edu/cgi-bin/hgTracks), section `My Data/Custom Tracks`, and upload the .gz file generated by MosaiCatcher located in `//plots/UCSC/.bedUCSC.gz`. Then click on the `submit` button and wait until loading completes. You should then see the list of loaded tracks.
+
+#### IGV
+
+MosaiCatcher generates an XML session ready to be used by IGV. Files in this XML session are referenced using relative paths, so it is important either to mount the disk where the data resides on your computer, or to copy the complete `//plots/IGV/` folder to your computer. Once this is done, open the software, click on `File/Open Session` and load the XML file present in the IGV folder. Please note that the SV call coloring only appears once you display the data at the chromosome level, not at the genome level.
+
+### scTRIP multiplot (Marco Cosenza)
+
+From 2.2.2, it is now possible to use [scTRIP multiplot](/docs/usage.md#sctrip-multiplot) inside MosaiCatcher. By enabling the corresponding configuration option (`scTRIP_multiplot=True`), you will obtain, for each chromosome of each cell that can be processed, a plot similar to the one below:
+
+| ![summary](images/plots/scTRIP_multiplot.png) |
+| :---------------------------------------------------------------------: |
+| _scTRIP multiplot (Watson/Crick, depth, haplotype phased and SV calls)_ |
+
 ## Statistics

 ---
diff --git a/docs/parameters.md b/docs/parameters.md
index 4cec6b84..de7c7258 100644
--- a/docs/parameters.md
+++ b/docs/parameters.md
@@ -29,15 +29,15 @@ All these arguments can be specified in two ways:

 ### Ashleys-QC upstream pipeline

-| Parameter | Comment | Parameter type | Default |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------ | -------------- | ------- |
-| `input_bam_legacy` | Mutualy exclusive with ashleys_pipeline. Will use `selected` folder to identify high-quality libraries to process | Boolean | False |
-| `ashleys_pipeline` | Allow to load and use ashleys-qc-pipeline snakemake preprocessing module and to start from FASTQ inputs | Boolean | False |
-| `ashleys_pipeline_only` | Stop the execution after ashleys-qc-pipeline submodule. Requires `ashleys_pipeline` to be True | Boolean | False |
-| `ashleys_threshold` | Threshold for Ashleys-qc binary classification | Float | 0.5 |
-| `MultiQC` | Enable or disable MultiQC analysis (includes FastQC, samtools flagstats & idxstats) | Boolean | False |
-| `hand_selection` | Enable or disable hand selection through the Jupyter Notebook | Boolean | False |
-| `split_qc_plot` | Enable or disable the split of QC plot into individual pages plots | Boolean | False |
+| Parameter | Comment | Parameter type | Default |
+| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
+| `input_bam_legacy` | Mutually exclusive with ashleys_pipeline. 
Will use `selected` folder to identify high-quality libraries to process | Boolean | False | +| `ashleys_pipeline` | Allow to load and use [ashleys-qc-pipeline](/docs/usage.md#3c-fastq-input--preprocessing-module) snakemake preprocessing module and to start from FASTQ inputs | Boolean | False | +| `ashleys_pipeline_only` | Stop the execution after ashleys-qc-pipeline submodule. Requires `ashleys_pipeline` to be True | Boolean | False | +| `ashleys_threshold` | Threshold for Ashleys-qc binary classification | Float | 0.5 | +| `MultiQC` | Enable or disable MultiQC analysis (includes FastQC, samtools flagstats & idxstats) | Boolean | False | +| `hand_selection` | Enable or disable hand selection through the Jupyter Notebook | Boolean | False | +| `split_qc_plot` | Enable or disable the split of QC plot into individual pages plots | Boolean | False | ### Reference data & Chromosomes @@ -49,11 +49,11 @@ All these arguments can be specified in two ways: ### Counts configuration -| Parameter | Comment | Default | -| ---------------------------------- | --------------------------------------------------------------------------------------------------- | ------- | -| `multistep_normalisation_analysis` | Allow to perform multistep normalisation including GC correction for visualization (Marco Cosenza). | False | -| `window` | Window size used for binning by mosaic count (Can be of high importance regarding library coverage) | 100000 | -| `blacklist_regions` | Enable/Disable blacklisting | True | +| Parameter | Comment | Default | +| ------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| `multistep_normalisation` | Allow to perform [multistep normalisation](/docs/usage.md#multistep-normalisation) including GC correction for visualization (Marco Cosenza). 
| False |
+| `window` | Window size used for binning by mosaic count (Can be of high importance regarding library coverage) | 100000 |
+| `blacklist_regions` | Enable/Disable blacklisting | True |

### SV calling parameters

@@ -74,11 +74,12 @@ All these arguments can be specified in two ways:

### Downstream analysis

-| Parameter | Comment | Default |
-| ------------------- | ----------------------------------------------------------------------------------- | ------- |
-| `arbigent` | Enable ArbiGent mode of execution to genotype SV based on arbitrary segments | False |
-| `arbigent_bed_file` | Allow to specify custom ArbiGent BED file | "" |
-| `scNOVA` | Enable scNOVA mode of execution to compute Nucleosome Occupancy (NO) of detected SV | False |
+| Parameter | Comment | Default |
+| ------------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------- |
+| `arbigent` | Enable [ArbiGent](/docs/usage.md#arbigent-mode-of-execution) mode of execution to genotype SV based on arbitrary segments | False |
+| `arbigent_bed_file` | Allow to specify custom ArbiGent BED file | "" |
+| `scNOVA` | Enable [scNOVA](/docs/usage.md#scnova-mode-of-execution) mode of execution to compute Nucleosome Occupancy (NO) of detected SV | False |
+| `scTRIP_multiplot` | Enable [scTRIP multiplot](/docs/usage.md#sctrip-multiplot) generation for all chromosomes of all cells | False |

### EMBL specific options

diff --git a/docs/usage.md b/docs/usage.md
index 75935aed..96dbf017 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -14,6 +14,15 @@ From 2.2.0, you don't need to clone both [ashleys-qc-pipeline preprocessing modu

 1. A. Create a dedicated conda environment

+---
+
+**ℹ️ Note**
+
+- Please be careful with your conda/mamba setup: if you have applied specific constraints/modifications to your system, this could lead to version discrepancies.
+- mamba is usually preferred but might not be installed by default on a shared cluster environment
+
+---
+
 ```bash
 conda create -n snakemake -c bioconda -c conda-forge -c defaults -c anaconda snakemake
 ```

@@ -24,7 +33,7 @@ conda create -n snakemake -c bioconda -c conda-forge -c defaults -c anaconda sna
 conda activate snakemake
 ```

-**Reminder:** You will need to verify that this conda environment is activated and provide the right snakemake before each execution (`which snakemake` command should output like //[ana|mini]conda3/envs/snakemake/bin/snakemake)
+**Reminder:** You will need to verify that this conda environment is activated and provides the right snakemake binary before each execution (the `which snakemake` command should output something like \<conda_install_path\>/[ana|mini]conda3/envs/snakemake/bin/snakemake)
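+
+For example, a quick sanity check (a sketch — the exact path will differ on your system):
+
+```bash
+conda activate snakemake
+which snakemake
+# expected output, e.g.: /home/<user>/miniconda3/envs/snakemake/bin/snakemake
+```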
2. Clone the repository

@@ -81,8 +90,7 @@ snakemake \

 **ℹ️ Note for 🇪🇺 EMBL users**

-- You can load already installed snakemake modules on the HPC (by connecting to login01 & login02) using the following `module load snakemake/7.14.0-foss-2022a`
-- Use the following command for singularity-args parameter: `--singularity-args "-B /g:/g -B /scratch:/scratch"`
+- Use the following profile to run on the EMBL cluster: `--profile workflow/snakemake_profiles/HPC/slurm_EMBL`

---

@@ -202,7 +210,7 @@ If possible, it is also highly recommended to install and use `mamba` package ma

 ```bash
 conda install -c conda-forge mamba
-mamba create -n snakemake -c bioconda -c conda-forge -c defaults -c anaconda snakemake=7.14.0
+mamba create -n snakemake -c bioconda -c conda-forge -c defaults -c anaconda snakemake
 conda activate mosaicatcher_env
 ```

@@ -297,19 +305,15 @@ Parent_folder
 `-- selected
     |-- Cell_03.sort.mdup.bam
     `-- Cell_04.sort.mdup.bam
-
-
-
-
 ```

-> Using the `old behavior`, cells flagged as low-quality will be determined both based on their presence in the `selected` folder presented above and on coverage [see Note here](#note:-filtering-of-low-quality-cells-impossible-to-process).
+> Using the `input_bam_legacy` parameter, cells flagged as low-quality will be determined both based on their presence in the `selected` folder presented above and on coverage [see Note here](#note:-filtering-of-low-quality-cells-impossible-to-process).

---

**⚠️ Warning**

-Using the `old behavior`, only **intersection** between cells present in the selected folder and with enough coverage will be kept. Example: if a library is present in the selected folder but present a low coverage [see Note here](#note:-filtering-of-low-quality-cells-impossible-to-process), this will not be processed.
+Using the `input_bam_legacy` parameter, only the **intersection** of cells present in the selected folder and cells with enough coverage will be kept. Example: if a library is present in the selected folder but presents low coverage [see Note here](#note:-filtering-of-low-quality-cells-impossible-to-process), it will not be processed.

---

@@ -386,7 +390,7 @@ snakemake \

 **ℹ️ Note**

 It is possible to provide multiple mouting points between system and cointainer using as many `-B` as needed in the `singularity-args` command like the following: "-B /:/ -B /:/"
-For EMBL users, this can be for example "-B /g:/g -B /scratch:/scratch"
+For EMBL users, you don't need to specify this, as it is already part of the execution profile (workflow/snakemake_profiles/HPC/slurm_EMBL)

---

@@ -530,6 +534,12 @@ snakemake \

 ```

+## scTRIP multiplot
+
+From 2.2.2, scTRIP multiplot (from Marco Cosenza) is now compatible with MosaiCatcher. The single requirement is to clone the scTRIP multiplot repository (currently private; please reach out to Marco to request access) into `workflow/scripts/plotting/scTRIP_multiplot`.
+
+By default, scTRIP multiplot is disabled. To enable it, update the `config/config.yaml` file or pass `scTRIP_multiplot=True` in the config section of the command line, for example:
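+
+A minimal invocation sketch (core count, data path and the chosen execution profile are illustrative; adapt them to your setup):
+
+```bash
+snakemake \
+    --cores 24 \
+    --config \
+        data_location=<DATA_FOLDER> \
+        scTRIP_multiplot=True \
+    --profile workflow/snakemake_profiles/local/conda/
+```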
An example of a scTRIP multiplot is available [here](/docs/output.md#sctrip-multiplot-marco-cosenza) + --- **ℹ️ Note** @@ -546,9 +556,9 @@ If you already use a previous version of mosaicatcher-pipeline, here is a short `git fetch --all` -- Jump to a new version (for example 2.1.0) & pull code: +- Jump to a new version (for example 2.2.2) & pull code: -`git checkout 2.1.0 && git pull` +`git checkout 2.2.2 && git pull` Then, to initiate or update git snakemake_profiles submodule: diff --git a/github-actions-runner/docker_procedure.md b/github-actions-runner/docker_procedure.md index f3ea5c61..0fc6d2ad 100644 --- a/github-actions-runner/docker_procedure.md +++ b/github-actions-runner/docker_procedure.md @@ -46,4 +46,4 @@ snakemake --configfile .tests/config/simple_config.yaml --config ashleys_pipelin docker login -u weber8thomas docker build --platform=linux/amd64 -t weber8thomas/mosaicatcher-pipeline:VERSION . -docker push -t weber8thomas/mosaicatcher-pipeline:VERSION +docker push weber8thomas/mosaicatcher-pipeline:VERSION diff --git a/watchdog_pipeline/watchdog_pipeline.py b/watchdog_pipeline/watchdog_pipeline.py new file mode 100644 index 00000000..d3df33c6 --- /dev/null +++ b/watchdog_pipeline/watchdog_pipeline.py @@ -0,0 +1,319 @@ +import time +import os, sys, glob, subprocess, re +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler +from datetime import datetime +import logging +import json +import pandas as pd +import threading + + +os.makedirs("watchdog/logs", exist_ok=True) + +# Setup the logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[ + logging.FileHandler("watchdog/logs/watchdog_ashleys.log"), # File handler to log to a file + logging.StreamHandler(), # Stream handler to log to the console + ], +) + + +# Set the path you want to watch +path_to_watch = sys.argv[1] + +data_location = "/scratch/tweber/DATA/MC_DATA/STOCKS_DEV" +publishdir_location = "/g/korbel/weber/TMP/WORKFLOW_RESULTS_DEV" +# publishdir_location = "/g/korbel/WORKFLOW_RESULTS" +genecore_prefix = path_to_watch +profile_slurm = ["--profile", "workflow/snakemake_profiles/HPC/slurm_EMBL/"] +profile_dry_run = ["--profile", "workflow/snakemake_profiles/local/conda/", "-c", "1"] +dry_run_options = ["-n", "-q"] +# snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_latest/bin/snakemake" +snakemake_binary = "/g/korbel2/weber/miniconda3/envs/snakemake_panoptesfix/bin/snakemake" + + +# plates_processing_status = pd.read_csv("watchdog/processing_status.json", sep="\t") +# print(plates_processing_status) + + +# Define the event handler +class MyHandler(FileSystemEventHandler): + def on_created(self, event): + if event.is_directory: # if a directory is created + logging.info(f"Directory {event.src_path} has been created!") + self.process_new_directory(event.src_path) + + def check_unprocessed_folder(self): + unwanted = ["._.DS_Store", ".DS_Store", "config"] + list_runs_processed = sorted([e for e in os.listdir(data_location) if e not in unwanted]) + total_list_runs = sorted([e for e in os.listdir(path_to_watch) if e not in unwanted]) + unprocessed_plates = set(total_list_runs).difference(list_runs_processed) + print(list_runs_processed) + print(total_list_runs) + print(unprocessed_plates) + # for plate in ["2023-07-10-HLGVJAFX5"]: + for plate in unprocessed_plates: + # if plate not in plates_processing_status["plate"].values.tolist(): + # plates_processing_status_plate_dict = collections.defaultdict(dict) + 
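# NOTE (assumption, for readability): a GeneCore plate appears to correspond to
+            # 96 cells sequenced paired-end, i.e. 192 .txt.gz files (and 576 = 3 x 192 in
+            # the commented historical check below); the modulo-192 tests used throughout
+            # this script follow that convention.
+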
nb_txt_gz_files = len(glob.glob(f"{path_to_watch}/{plate}/*.txt.gz"))
+            # if nb_txt_gz_files == 576:
+            # if (nb_txt_gz_files % 192) == 0:
+            print(f"PROCESSING {path_to_watch}/{plate}")
+            self.process_new_directory(f"{path_to_watch}/{plate}")
+            # else:
+            #     print(f"Not possible to process {path_to_watch}/{plate}, containing {nb_txt_gz_files} txt.gz files")
+
+    def process_new_directory(self, directory_path):
+        """Process the new directory, check for .txt.gz files and execute snakemake command if conditions are met."""
+
+        # Poll the directory until the expected number of .txt.gz files appears or a timeout is reached (polling currently disabled below)
+        timeout = 60  # Timeout in seconds
+        start_time = time.time()
+
+        # while True:
+        # Count the number of .txt.gz files in the new directory
+        txt_gz_files = glob.glob(directory_path + "/*.txt.gz")
+        num_files = len(txt_gz_files)
+
+        # # If the desired number of files is found or timeout is reached, break the loop
+        # if (num_files % 192) == 0 or time.time() - start_time > timeout:
+        #     break
+
+        # # Sleep for a while before the next poll
+        # time.sleep(5)  # Sleep for 5 seconds
+
+        # Process the found .txt.gz files
+        self.process_txt_gz_files(directory_path, txt_gz_files, num_files)
+
+    def process_txt_gz_files(self, directory_path, txt_gz_files, num_files):
+        """Process the found .txt.gz files and execute snakemake command if conditions are met."""
+
+        if (num_files % 192) == 0:
+            logging.info(f"The new directory contains {num_files} .txt.gz files (a multiple of 192).")
+            self.execute_snakemake(directory_path, txt_gz_files)
+
+        else:
+            logging.info(f"The new directory contains {num_files} .txt.gz files, which is not a multiple of 192.")
+
+    def execute_snakemake(self, directory_path, txt_gz_files):
+        """Execute the snakemake command based on the found prefixes."""
+        pattern = re.compile(r"_lane1(.*?)(iTRU|PE20)(.*?)([A-H]?)(\d{2})(?:_1_|_2_)")
+        prefixes = list()
+
+        for file_path in sorted(txt_gz_files):
+            match = pattern.search(file_path)
+            # print(file_path, match)
+            if match:
+                prefix = match.group(2)
+                # print(sample_name)
+                # prefix = match.group(2) + match.group(4) + match.group(5)  # Concatenate the prefix, optional letter, and two digits
+                prefixes.append(prefix)
+                # indexes.add(index)
+        # pattern = re.compile(r"(iTRU|PE20)\d{3}")
+        # prefixes = set()
+        #
+        # for file_path in txt_gz_files:
+        #     match = pattern.search(file_path)
+        #     print(file_path)
+        #     if match:
+        #         prefix = match.group()[:4]  # Get the first 4 characters, which is the prefix
+        #         prefixes.add(prefix)
+
+        if len(set(prefixes)) > 1:
+            logging.info("Multiple different prefixes found: %s", prefixes)
+        elif prefixes:
+            for j, file_path in enumerate(sorted(txt_gz_files)):
+                if (j + 1) % 192 == 0:
+                    match = pattern.search(file_path)
+                    sample_name = match.group(1)
+                    cell = f"{sample_name}{prefixes[0]}{match.group(3)}{match.group(4)}96"
+                    # print(file_path, j, match, sample_name, cell)
+                    # print([match.group(i) for i in range(6)])
+                    # self.execute_command(directory_path, prefixes[0], sample_name)
+
+                    # Debug/dev purpose - target a specific file
+                    self.execute_command(directory_path, prefixes[0], sample_name, cell)
+        else:
+            logging.info("No match found in any file.")
+
+    def execute_command(self, directory_path, prefix, sample, cell=None):
+        """Execute the command."""
+
+        # Change directory and run the snakemake command
+        date_folder = directory_path.split("/")[-1]
+
+        cmd = [
+            f"{snakemake_binary}",
+            "-s",
+            "workflow/Snakefile",
+            "--config",
+            "genecore=True",
+            f"genecore_prefix={genecore_prefix}",
+            f"genecore_date_folder={date_folder}",
f"genecore_regex_element={prefix}", + f'samples_to_process="[{sample}]"', + "multistep_normalisation=True", + "MultiQC=True", + "split_qc_plot=False", + f"publishdir={publishdir_location}", + "email=thomas.weber@embl.de", + f"data_location={data_location}", + "ashleys_pipeline_only=True", + "ashleys_pipeline=True", + "--nolock", + "--rerun-triggers", + "mtime", + ] + + if cell: + cmd = cmd[:-2] + [ + f"{data_location}/{date_folder}/{sample}/multiqc/fastqc/{cell}_1_fastqc.html", + "--rerun-triggers", + "mtime", + "--force", + ] + + logging.info("Running command: %s", " ".join(cmd + profile_dry_run + dry_run_options)) + + process = subprocess.Popen( + cmd + profile_dry_run + dry_run_options, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True + ) + + # Variable to store the penultimate line + penultimate_line = "" + + # Read the output line by line in real-time + for line in iter(process.stdout.readline, ""): + logging.info(line.strip()) # log line in real-time + if line.strip(): # If line is not blank + penultimate_line = line.strip() + + # Wait for the subprocess to finish + process.wait() + logging.info("Return code: %s", process.returncode) + + # Check the penultimate line + if str(process.returncode) == str(0): + self.run_second_command(cmd, profile_slurm, data_location, date_folder, sample, cell) + else: + logging.info("\nThe output is not as expected.") + + def run_second_command(self, cmd, profile_slurm, data_location, date_folder, sample, cell=None): + """Run the second command and write the output to a log file.""" + + report_location = f"{publishdir_location}/{date_folder}/{sample}/reports/{sample}_ashleys-qc-pipeline_report.zip" + report_options = [ + "--report", + f"{report_location}", + "--report-stylesheet", + "/g/korbel2/weber/workspace/mosaicatcher-update/workflow/report/custom-stylesheet.css", + ] + + # Panoptes + pipeline = "ashleys-qc-pipeline" + + wms_monitor_options = "http://127.0.0.1:8058" + run_id = f"{pipeline}--{date_folder}--{sample}" + wms_monitor_renaming_option = f"name={run_id}" + + wms_monitor_args = [ + "--wms-monitor", + f"{wms_monitor_options}", + "--wms-monitor-arg", + f"{wms_monitor_renaming_option}", + ] + + # print(cmd + profile_slurm + report_options) + + logging.info("\nThe output is as expected.") + logging.info("Running command: %s", " ".join(cmd + wms_monitor_args + profile_dry_run)) + + os.makedirs("watchdog/logs/per-run", exist_ok=True) + + # Get the current date and time + now = datetime.now() + + # Convert it to a string + current_time = now.strftime("%Y%m%d%H%M%S") + + with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}.log", "w") as f: + process2 = subprocess.Popen(cmd + wms_monitor_args + profile_dry_run, stdout=f, stderr=f, universal_newlines=True) + # process2 = subprocess.Popen(cmd + profile_slurm, stdout=f, stderr=f, universal_newlines=True) + process2.wait() + + logging.info("Return code: %s", process2.returncode) + + logging.info("Generating ashleys report.") + os.makedirs(os.path.dirname(report_location), exist_ok=True) + # os.makedirs(f"{publishdir_location}/{date_folder}/{sample}/reports/", exist_ok=True) + logging.info("Running command: %s", " ".join(cmd + profile_slurm + report_options)) + # Change the permissions of the new directory + # subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"]) + + with open(f"watchdog/logs/per-run/{date_folder}_{pipeline}_{current_time}_report.log", "w") as f: + print(cmd + profile_slurm + report_options) + process2 = 
subprocess.Popen(cmd + profile_dry_run + report_options, stdout=f, stderr=f, universal_newlines=True) + # process2 = subprocess.Popen(cmd + profile_slurm + report_options, stdout=f, stderr=f, universal_newlines=True) + process2.wait() + + logging.info("Return code: %s", process2.returncode) + + # ZIPFILE + + import zipfile + + # Check if the file exists and is a valid zip file + if zipfile.is_zipfile(report_location): + # Specify the directory where you want to extract the contents + # If you want to extract in the same directory as the zip file, just use the parent directory + extract_location = f"{publishdir_location}/{date_folder}/{sample}/reports/" + + # Extract the zip file + with zipfile.ZipFile(report_location, "r") as zip_ref: + zip_ref.extractall(extract_location) + print(f"Extracted the archive to {extract_location}") + else: + print(f"{report_location} is not a valid zip file.") + + # Change the permissions of the new directory + subprocess.run(["chmod", "-R", "777", f"{data_location}/{date_folder}"]) + + +def main(): + # Create the event handler + event_handler = MyHandler() + + # Create an observer + observer = Observer() + + # Assign the observer to the path and the event handler + observer.schedule(event_handler, path_to_watch, recursive=False) + + # Start the observer + observer.start() + + # Start the periodical directory scanning in a separate thread + def periodic_scan(): + while True: + event_handler.check_unprocessed_folder() + time.sleep(3600) # Scan the directory every hour + + scan_thread = threading.Thread(target=periodic_scan) + scan_thread.start() + + try: + while True: + logging.info("Waiting for new plate ...") + time.sleep(3600) + except KeyboardInterrupt: + observer.stop() + + observer.join() + + +if __name__ == "__main__": + main() diff --git a/workflow/Snakefile b/workflow/Snakefile index 980f9ca0..5acf31fe 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,8 +1,6 @@ -from snakemake.utils import min_version +configfile_location = "config/config.yaml" -min_version("7.5.0") -configfile_location = "config/config.yaml" configfile: configfile_location @@ -21,13 +19,10 @@ if config["ashleys_pipeline"] is True: module ashleys_qc: snakefile: - # "../../ashleys-qc-pipeline/workflow/Snakefile" github( "friendsofstrandseq/ashleys-qc-pipeline", path="workflow/Snakefile", - tag=str(config["ashleys_pipeline_version"]) - # branch="main", - # branch="dev", + tag=str(config["ashleys_pipeline_version"]), ) config: config @@ -101,6 +96,9 @@ include: "rules/scNOVA.smk" include: "rules/gc.smk" +include: "rules/debug.smk" + + if config["list_commands"] is False: if config["ashleys_pipeline"] is True: if config["ashleys_pipeline_only"] is False: diff --git a/workflow/envs/dev/debug.yaml b/workflow/envs/dev/debug.yaml new file mode 100644 index 00000000..7bfca424 --- /dev/null +++ b/workflow/envs/dev/debug.yaml @@ -0,0 +1,6 @@ +name: fastqc_env +channels: + - bioconda + - conda-forge +dependencies: + - fastqc diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index ce78a056..a25de0b9 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -1,9 +1,9 @@ .. role:: underline :class: underline - + MosaiCatcher v2 is a `Snakemake `__ pipeline that aims to detect Structural variants from single-cell Strand-seq data. 
-**Versions used and general parameters:** +**Versions used and general parameters:** * MosaiCatcher version used: {{ snakemake.config["version"] }} * Ashleys version used (if enabled): {{ snakemake.config["ashleys_pipeline_version"] }} @@ -15,7 +15,7 @@ MosaiCatcher v2 is a `Snakemake `__ pipeline that a * Publishdir defined: {{ snakemake.config["publishdir"] }} * Input BAM legacy (bam & selected ; mutually exclusive with ashleys_pipeline): {{ snakemake.config["input_bam_legacy"] }} -**Ashleys-QC parameters (if enabled) +**Ashleys-QC parameters (if enabled)** * Ashleys-QC preprocessing pipeline enabled: {{ snakemake.config["ashleys_pipeline"] }} * Ashleys-QC preprocessing pipeline version used: {{ snakemake.config["ashleys_pipeline_version"] }} diff --git a/workflow/rules/aggregate_fct.smk b/workflow/rules/aggregate_fct.smk index 51cfdd31..278d45b9 100644 --- a/workflow/rules/aggregate_fct.smk +++ b/workflow/rules/aggregate_fct.smk @@ -153,6 +153,30 @@ def aggregate_cells_haplotag_tables(wildcards): ) +def aggregate_cells_scTRIP_multiplot(wildcards): + """ + Function based on checkpoint filter_bad_cells_from_mosaic_count + to process the segmentation only on cells that were flagged as high-quality + Return {cell}.txt + """ + df = pd.read_csv( + checkpoints.filter_bad_cells_from_mosaic_count.get( + sample=wildcards.sample, folder=config["data_location"] + ).output.info, + skiprows=13, + sep="\t", + ) + cell_list = df.cell.tolist() + + return expand( + "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png", + folder=config["data_location"], + sample=wildcards.sample, + cell=cell_list, + chrom=config["chromosomes"], + ) + + def unselected_input_bam(wildcards): """ Function based on checkpoint filter_bad_cells_from_mosaic_count @@ -290,3 +314,10 @@ def select_binbed(wildcards): return "workflow/data/bin_200kb_all.bed" else: return "workflow/data/mm10.bin_200kb_all.bed" + + +def select_labels(wildcards): + if config["use_strandscape_labels"]: + return "{folder}/{sample}/cell_selection/labels_strandscape.tsv" + else: + return "{folder}/{sample}/cell_selection/labels.tsv" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 999140ba..4c42e1b8 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -110,8 +110,9 @@ if config["ashleys_pipeline_only"] is True: if config["scNOVA"] is True: + # print(config["chromosomes_to_exclude"]) assert ( - "chrY" not in config["chromosomes"] + "chrY" in config["chromosomes_to_exclude"] ), "chrY is not handled by scNOVA yet, please remove it for config['chromosomes'] and add it in config['chomosomes_to_exclude']" @@ -284,10 +285,16 @@ class HandleInput: complete_df_list = list() # List of folders/files to not consider (restrict to samples only) - l_to_process = [e for e in os.listdir(thisdir) if e not in exclude and e.endswith(".zip") is False] + l_to_process = [ + e + for e in os.listdir(thisdir) + if e not in exclude and e.endswith(".zip") is False + ] # print(l_to_process) if config["samples_to_process"]: - l_to_process = [e for e in l_to_process if e in config["samples_to_process"]] + l_to_process = [ + e for e in l_to_process if e in config["samples_to_process"] + ] for sample in l_to_process: # Create a list of files to process for each sample @@ -396,12 +403,13 @@ samples = list(sorted(list(df_config_files.Sample.unique().tolist()))) # scNOVA dedicated - to handle only selected cells labeled by the user /& ashleys if config["scNOVA"] is True: l = list() - + # print(samples) for sample in samples: # Path of the 
labels file labels_path = "{folder}/{sample}/cell_selection/labels.tsv".format( folder=config["data_location"], sample=sample ) + # print(labels_path) if os.path.exists(labels_path): # Read df tmp_df_labels_selected = pd.read_csv(labels_path, sep="\t")[ @@ -417,6 +425,8 @@ if config["scNOVA"] is True: tmp_df_labels_selected["Selected"] = tmp_df_labels_selected[ "Selected" ].astype(bool) + # print(tmp_df_labels_selected) + # print(df_config_files) # Merge dfs tmp_merge_df = pd.merge( tmp_df_labels_selected, @@ -434,7 +444,7 @@ if config["scNOVA"] is True: ] tmp_merge_df["Selected"] = True l.append(tmp_merge_df) - + # print(l) # Concat df to create a new one df_config_files_with_labels = pd.concat(l).reset_index(drop=True) df_config_files_with_labels.to_csv( @@ -501,7 +511,7 @@ if config["scNOVA"] is True: sep="\t", ) clones[sample] = list(sorted(subclonality_file.Subclonality.unique().tolist())) - # print(clones) +# print(clones) def get_mem_mb(wildcards, attempt): @@ -514,9 +524,13 @@ def get_mem_mb_heavy(wildcards, attempt): return mem_avail[attempt - 1] * 1000 -def onsuccess_fct(log): - config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) - log_path_new = make_log_useful.make_log_useful(log, "SUCCESS", config, config_metadata) +def onsuccess_fct(log): + config_metadata = config_definitions = yaml.safe_load( + open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r") + ) + log_path_new = make_log_useful.make_log_useful( + log, "SUCCESS", config, config_metadata + ) shell( 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - SUCCESS" {} < {}'.format( config["version"], config["data_location"], config["email"], log_path_new @@ -525,8 +539,12 @@ def onsuccess_fct(log): def onerror_fct(log): - config_metadata = config_definitions = yaml.safe_load(open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r")) - log_path_new = make_log_useful.make_log_useful(log, "ERROR", config, config_metadata) + config_metadata = config_definitions = yaml.safe_load( + open(configfile_location.replace("config.yaml", "config_metadata.yaml"), "r") + ) + log_path_new = make_log_useful.make_log_useful( + log, "ERROR", config, config_metadata + ) shell( 'mail -s "[Snakemake] smk-wf-catalog/mosacaitcher-pipeline v{} - Run on {} - ERRROR" {} < {}'.format( config["version"], config["data_location"], config["email"], log_path_new @@ -1046,15 +1064,27 @@ def get_all_plots(wildcards): ), ) - # UCSC section + # scTRIP multiplot - l_outputs.extend( - expand( - "{folder}/{sample}/plots/UCSC/{sample}.bedUCSC.gz", - folder=config["data_location"], - sample=wildcards.sample, - ), - ) + if config["scTRIP_multiplot"] == True: + l_outputs.extend( + expand( + "{folder}/{sample}/plots/scTRIP_multiplot_aggr.ok", + folder=config["data_location"], + sample=wildcards.sample, + ) + ) + + # UCSC + IGV + + if config["genome_browsing_files_generation"] == True: + l_outputs.extend( + expand( + "{folder}/{sample}/plots/IGV/{sample}_IGV_session.xml", + folder=config["data_location"], + sample=wildcards.sample, + ), + ) # Stats section diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk index ec4a43cb..080d64b7 100755 --- a/workflow/rules/count.smk +++ b/workflow/rules/count.smk @@ -108,11 +108,7 @@ if config["ashleys_pipeline"] is False: rule copy_labels: input: - lambda wc: expand( - "{folder}/{sample}/cell_selection/labels.tsv", - folder=config["data_location"], - sample=wc.sample, - ), + 
select_labels, output: "{folder}/{sample}/config/labels.tsv", log: diff --git a/workflow/rules/debug.smk b/workflow/rules/debug.smk new file mode 100644 index 00000000..54fd9d1c --- /dev/null +++ b/workflow/rules/debug.smk @@ -0,0 +1,25 @@ + +rule fastqc_debug: + input: + fastq="{folder}/{sample}/fastq/{cell}.{pair}.fastq.gz", + fastqc_check="{folder}/{sample}/multiqc/fastqc/{cell}_{pair}_fastqc.html", + labels_strandscape="{folder}/{sample}/cell_selection/labels_strandscape.tsv", + output: + html=report( + "{folder}/{sample}/debug/mosaicatcher_fastqc/{cell}.{pair}_fastqc.html", + category="FastQC", + subcategory="{sample}", + labels={"Sample": "{sample}", "Cell": "{cell}", "Pair": "{pair}"}, + ), + zip="{folder}/{sample}/debug/mosaicatcher_fastqc/{cell}.{pair}_fastqc.zip", + log: + "{folder}/log/fastqc_debug/{sample}/{cell}_{pair}.log", + threads: 1 + resources: + mem_mb=get_mem_mb, + conda: + "../envs/dev/debug.yaml" + params: + outdir=lambda wc, output: config["abs_path"].join(output.zip.split("/")[:-1]), + shell: + "fastqc --outdir {params.outdir} --quiet {input.fastq} " diff --git a/workflow/rules/external_data.smk b/workflow/rules/external_data.smk index 02d84538..71ed4d63 100644 --- a/workflow/rules/external_data.smk +++ b/workflow/rules/external_data.smk @@ -142,9 +142,11 @@ rule download_arbigent_mappability_track: rule download_scnova_data: input: - HTTP.remote( - "https://zenodo.org/record/7697400/files/scNOVA_data_models.zip", - keep_local=True, + ancient( + HTTP.remote( + "https://zenodo.org/record/7697400/files/scNOVA_data_models.zip", + keep_local=True, + ) ), output: "workflow/data/scNOVA/utils/bin_chr_length.bed", diff --git a/workflow/rules/gc.smk b/workflow/rules/gc.smk index d59e8e8c..73d7608a 100644 --- a/workflow/rules/gc.smk +++ b/workflow/rules/gc.smk @@ -76,32 +76,34 @@ if ( script: "../scripts/GC/variance_stabilizing_transformation.R" - rule populate_counts_GC: + rule reformat_ms_norm: input: - bin_bed="workflow/data/bin_200kb_all.bed", - counts="{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.gz", + "{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.gz", output: - populated_counts="{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.populated.gz", + "{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.reformat.gz", log: - "{folder}/log/plot_mosaic_counts/{sample}.log", + "{folder}/log/reformat_ms_norm/{sample}.log", conda: "../envs/mc_base.yaml" resources: mem_mb=get_mem_mb, script: - "../scripts/utils/populated_counts_for_qc_plot.py" + "../scripts/utils/reformat_ms_norm.py" - rule reformat_ms_norm: + rule populate_counts_GC: input: - "{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.gz", + bin_bed="workflow/data/bin_200kb_all.bed", + counts="{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.reformat.gz", output: - "{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.reformat.gz", + populated_counts="{folder}/{sample}/counts/multistep_normalisation/{sample}.txt.scaled.GC.VST.populated.gz", + log: + "{folder}/log/populate_counts_GC/{sample}.log", conda: "../envs/mc_base.yaml" resources: mem_mb=get_mem_mb, script: - "../scripts/utils/reformat_ms_norm.py" + "../scripts/utils/populated_counts_for_qc_plot.py" rule plot_mosaic_gc_norm_counts: input: diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk index 6b016fc0..1acc1e55 100644 --- a/workflow/rules/plots.smk +++ 
diff --git a/workflow/rules/plots.smk b/workflow/rules/plots.smk
index 6b016fc0..1acc1e55 100644
--- a/workflow/rules/plots.smk
+++ b/workflow/rules/plots.smk
@@ -54,27 +54,6 @@ rule divide_pdf:
         "../scripts/plotting/dividing_pdf.py"


-# rule divide_pdf:
-#     input:
-#         "{folder}/{sample}/plots/counts/CountComplete.{plottype}.pdf",
-#     output:
-#         report(
-#             "{folder}/{sample}/plots/counts_{plottype}/{cell}.{i, \d+}.pdf",
-#             caption="../report/mosaic_counts.rst",
-#             category="Mosaic counts",
-#             subcategory="{sample}",
-#             labels={"Cell": "{cell}", "Nb": "{i}", "Type": "{plottype}"},
-#         ),
-#     log:
-#         "{folder}/log/{sample}/plots/counts_{plottype}/{cell}.{i, \d+}.log",
-#     conda:
-#         "../envs/mc_base.yaml"
-#     resources:
-#         mem_mb=get_mem_mb,
-#     script:
-#         "../scripts/plotting/dividing_pdf.py"
-
-
 rule final_results:
     input:
         get_all_plots,
@@ -90,7 +69,9 @@ rule final_results:

 rule plot_SV_consistency_barplot:
     input:
-        sv_calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
+        sv_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv"
+        ),
     output:
         barplot_bypos=report(
             "{folder}/{sample}/plots/sv_consistency/{method}_filter{filter}.consistency-barplot-bypos.pdf",
@@ -126,7 +107,9 @@ rule plot_SV_consistency_barplot:

 rule plot_clustering:
     input:
-        sv_calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
+        sv_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv"
+        ),
         binbed=ancient("workflow/data/bin_200kb_all.bed"),
     output:
         position=report(
@@ -151,7 +134,9 @@ rule plot_clustering:

 rule plot_clustering_position_dev:
     input:
-        sv_calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
+        sv_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv"
+        ),
     output:
         pdf=report(
             "{folder}/{sample}/plots/sv_clustering_dev/{method}-filter{filter}-position.pdf",
@@ -176,7 +161,9 @@ rule plot_clustering_position_dev:

 rule plot_clustering_chromosome_dev:
     input:
-        sv_calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
+        sv_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv"
+        ),
         binbed=ancient(select_binbed),
         cluster_order_df="{folder}/{sample}/plots/sv_clustering_dev/clustering_{method}-filter{filter}-position.tsv",
     output:
@@ -204,11 +191,15 @@ rule plot_SV_calls:
     input:
         counts="{folder}/{sample}/counts/{sample}.txt.gz",
         calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
-        complex_calls="{folder}/{sample}/mosaiclassifier/complex/{method}_filter{filter}.tsv",
+        complex_calls=(
+            "{folder}/{sample}/mosaiclassifier/complex/{method}_filter{filter}.tsv"
+        ),
         strand="{folder}/{sample}/strandphaser/StrandPhaseR_final_output.txt",
         segments="{folder}/{sample}/segmentation/Selection_jointseg.txt",
         scsegments="{folder}/{sample}/segmentation/Selection_singleseg.txt",
-        grouptrack="{folder}/{sample}/mosaiclassifier/postprocessing/group-table/{method}.tsv",
+        grouptrack=(
+            "{folder}/{sample}/mosaiclassifier/postprocessing/group-table/{method}.tsv"
+        ),
     output:
         report(
             "{folder}/{sample}/plots/sv_calls/{method}_filter{filter}/{chrom}.pdf",
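
The sv_calls TSVs consumed by these plotting rules are plain tab-separated tables, so they are easy to inspect outside the pipeline. A hedged sketch of such an inspection (the column names "cell" and "sv_call_name" are assumptions for illustration, not guaranteed by the pipeline):

import pandas as pd

# Tally SV calls per cell and per call type from a MosaiClassifier output table;
# the path and the column names are illustrative assumptions.
df = pd.read_csv("stringent_filterTRUE.tsv", sep="\t")
per_cell = df.groupby(["cell", "sv_call_name"]).size().unstack(fill_value=0)
print(per_cell.head())
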
@@ -246,11 +237,15 @@ rule plot_SV_calls_dev:
     input:
         counts="{folder}/{sample}/counts/{sample}.txt.gz",
         calls="{folder}/{sample}/mosaiclassifier/sv_calls/{method}_filter{filter}.tsv",
-        complex_calls="{folder}/{sample}/mosaiclassifier/complex/{method}_filter{filter}.tsv",
+        complex_calls=(
+            "{folder}/{sample}/mosaiclassifier/complex/{method}_filter{filter}.tsv"
+        ),
         strand="{folder}/{sample}/strandphaser/StrandPhaseR_final_output.txt",
         segments="{folder}/{sample}/segmentation/Selection_jointseg.txt",
         scsegments="{folder}/{sample}/segmentation/Selection_singleseg.txt",
-        grouptrack="{folder}/{sample}/mosaiclassifier/postprocessing/group-table/{method}.tsv",
+        grouptrack=(
+            "{folder}/{sample}/mosaiclassifier/postprocessing/group-table/{method}.tsv"
+        ),
     output:
         report(
             "{folder}/{sample}/plots/sv_calls_dev/{method}_filter{filter}/{chrom}.pdf",
@@ -303,11 +298,49 @@ rule plot_ploidy:
         "../scripts/plotting/ploidy_plot.py"


+rule scTRIP_multiplot:
+    input:
+        install_check="workflow/config/scTRIP_multiplot.ok",
+        counts="{folder}/{sample}/counts/{sample}.txt.gz",
+        haplotag_bam="{folder}/{sample}/haplotag/bam/{cell}.bam.htg",
+        sv_counts="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv",
+    output:
+        figure=report(
+            "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png",
+            category="scTRIP multiplot",
+            subcategory="{sample}",
+            labels={"Cell": "{cell}", "Chrom": "{chrom}"},
+        ),
+    log:
+        "{folder}/log/scTRIP_multiplot/{sample}/{cell}/{chrom}.log",
+    conda:
+        "../envs/rtools.yaml"
+    resources:
+        mem_mb=get_mem_mb,
+    shell:
+        "LC_CTYPE=C Rscript workflow/scripts/plotting/scTRIP_multiplot/scTRIP_multiplot_run.R {input.counts} {input.haplotag_bam} {input.sv_counts} {wildcards.chrom} {wildcards.cell} {output.figure} > {log} 2>&1"
+
+
+rule scTRIP_multiplot_aggr:
+    input:
+        aggregate_cells_scTRIP_multiplot,
+    output:
+        touch("{folder}/{sample}/plots/scTRIP_multiplot_aggr.ok"),
+    log:
+        "{folder}/log/scTRIP_multiplot_aggr/{sample}.log",
+    resources:
+        mem_mb=get_mem_mb,
+
+
 rule ucsc_genome_browser_file:
     input:
         counts="{folder}/{sample}/counts/{sample}.txt.gz",
-        stringent_calls="{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv",
-        lenient_calls="{folder}/{sample}/mosaiclassifier/sv_calls/lenient_filterFALSE.tsv",
+        stringent_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/stringent_filterTRUE.tsv"
+        ),
+        lenient_calls=(
+            "{folder}/{sample}/mosaiclassifier/sv_calls/lenient_filterFALSE.tsv"
+        ),
     output:
         "{folder}/{sample}/plots/UCSC/{sample}.bedUCSC.gz",
     log:
@@ -318,3 +351,33 @@ rule ucsc_genome_browser_file:
         mem_mb=get_mem_mb,
     shell:
         "python workflow/scripts/plotting/ucsc_vizu.py {input.counts} {input.stringent_calls} {input.lenient_calls} {output} > {log}"
+
+
+rule split_ucsc_into_individual_tracks:
+    input:
+        ucsc_file="{folder}/{sample}/plots/UCSC/{sample}.bedUCSC.gz",
+    output:
+        output_dir=directory("{folder}/{sample}/plots/IGV/SPLITTED"),
+    log:
+        "{folder}/log/split_ucsc_into_individual_tracks/{sample}.log",
+    conda:
+        "../envs/mc_base.yaml"
+    resources:
+        mem_mb=get_mem_mb,
+    shell:
+        "sh workflow/scripts/plotting/split_ucsc_file.sh {input.ucsc_file} {output.output_dir}"
+
+
+rule generate_igv_session:
+    input:
+        splitted_files_dir="{folder}/{sample}/plots/IGV/SPLITTED",
+    output:
+        xml_session="{folder}/{sample}/plots/IGV/{sample}_IGV_session.xml",
+    log:
+        "{folder}/log/generate_igv_session/{sample}.log",
+    conda:
+        "../envs/mc_base.yaml"
+    resources:
+        mem_mb=get_mem_mb,
+    shell:
+        "sh workflow/scripts/plotting/generate_IGV_session.sh {input.splitted_files_dir} {output.xml_session}"
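
scTRIP_multiplot_aggr pairs an input function with a touch() sentinel so that downstream targets can depend on a single .ok flag instead of every per-cell, per-chromosome PNG. The body of aggregate_cells_scTRIP_multiplot is not part of this diff; a hypothetical sketch of the usual shape of such an aggregation function (the "Sample"/"Cell" column names and the config["chromosomes"] key are assumptions):

def aggregate_cells_scTRIP_multiplot(wildcards):
    # Hypothetical reconstruction for illustration; the real implementation
    # lives elsewhere in the workflow and may differ.
    df = df_config_files_with_labels
    cells = df.loc[(df["Sample"] == wildcards.sample) & (df["Selected"]), "Cell"]
    return expand(
        "{folder}/{sample}/plots/scTRIP_multiplot/{cell}/{chrom}.png",
        folder=config["data_location"],
        sample=wildcards.sample,
        cell=cells.unique().tolist(),
        chrom=config["chromosomes"],
    )
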
"Mmusculus" if config["reference"] == "mm10" else "Hsapiens", config["reference"] - ) - if config["reference"] in ["hg38", "hg19", "mm10"] - else input.package, + ) if config["reference"] in ["hg38", "hg19", "mm10"] else input.package, conda: "../envs/rtools.yaml" resources: mem_mb=get_mem_mb_heavy, - script: - "../scripts/utils/install_R_package.R" + shell: + "Rscript workflow/scripts/utils/install_R_package.R {params.selected_package}" + + +rule install_sctrip_multiplot_package: + input: + package=bsgenome_install, + output: + touch("workflow/config/scTRIP_multiplot.ok"), + log: + "log/install_sctrip_multiplot_package.log", + params: + selected_package="workflow/scripts/plotting/scTRIP_multiplot/scTRIPmultiplot", + conda: + "../envs/rtools.yaml" + resources: + mem_mb=get_mem_mb_heavy, + shell: + "Rscript workflow/scripts/utils/install_R_package.R {params.selected_package}" rule config_run_summary: diff --git a/workflow/scripts/plotting/generate_IGV_session.sh b/workflow/scripts/plotting/generate_IGV_session.sh new file mode 100644 index 00000000..eb0a2d23 --- /dev/null +++ b/workflow/scripts/plotting/generate_IGV_session.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# if [ "$#" -ne 1 ]; then +# echo "Usage: $0 " +# exit 1 +# fi + +# track_dir=${snakemake_input[splitted_files_dir]} +track_dir="$1" +# output_file=${snakemake_output[igv_session]} +output_file="$2" + +# Check if the directory exists +if [ ! -d "$track_dir" ]; then + echo "Error: Directory '$track_dir' not found!" + exit 1 +fi +# Start of the XML session +{ + echo ' + + ' + + # Loop through the bedGraph and bed files in the directory and populate the Resources section + for file in "$track_dir"/*.bed*; do + echo " " + done + + echo ' ' + echo ' ' + + # Loop through the bedGraph and bed files in the directory and populate the Panel section + for file in "$track_dir"/*.bed*; do + base=$(basename "$file" | sed "s/\.bed.*//g") + # echo "$base" + if [[ $file == *_1W.bedGraph ]]; then + echo " " + echo " " + echo " " + elif [[ $file == *_2C.bedGraph ]]; then + echo " " + echo " " + echo " " + elif [[ $file == *_3SVstringent.bed ]]; then + echo " " + echo " " + echo " " + fi + done + + echo ' ' + echo '' +} >"$output_file" diff --git a/workflow/scripts/plotting/scTRIP_multiplot/scTRIP_multiplot_run.R b/workflow/scripts/plotting/scTRIP_multiplot/scTRIP_multiplot_run.R new file mode 100644 index 00000000..81485e52 --- /dev/null +++ b/workflow/scripts/plotting/scTRIP_multiplot/scTRIP_multiplot_run.R @@ -0,0 +1,21 @@ +library(scTRIPmultiplot) +args <- commandArgs(TRUE) + + +counts_path <- args[1] +haplo_path <- args[2] +sv_path <- args[3] +chromosome <- args[4] +cell_id <- args[5] +savepath <- args[6] + + +scTRIPmultiplot::generate_multiplot( + counts_path = counts_path, + haplo_path = haplo_path, + chromosome = chromosome, + cell_id = cell_id, + savepath = savepath, + sv_path = sv_path, + size = 1 +) diff --git a/workflow/scripts/plotting/split_ucsc_file.sh b/workflow/scripts/plotting/split_ucsc_file.sh new file mode 100644 index 00000000..b4b0643c --- /dev/null +++ b/workflow/scripts/plotting/split_ucsc_file.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +input_gz_file="$1" + +# Check if the file exists +if [ ! -f "$input_gz_file" ]; then + echo "Error: File '$input_gz_file' not found!" 
diff --git a/workflow/scripts/plotting/split_ucsc_file.sh b/workflow/scripts/plotting/split_ucsc_file.sh
new file mode 100644
index 00000000..b4b0643c
--- /dev/null
+++ b/workflow/scripts/plotting/split_ucsc_file.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+input_gz_file="$1"
+
+# Check if the file exists
+if [ ! -f "$input_gz_file" ]; then
+    echo "Error: File '$input_gz_file' not found!"
+    exit 1
+fi
+
+output_dir="$2"
+mkdir -p "$output_dir"
+
+# Process the gzipped file and split it into separate tracks
+zcat "$input_gz_file" | awk -v outdir="$output_dir" '
+/^track/ {
+    if(filename != "") close(filename);
+
+    # Extract track name and modify it for filename
+    if($0 ~ /name="([^"]+)"/) {
+        filename = gensub(/.*name="([^"]+)".*/, "\\1", 1);
+    } else if ($0 ~ /name=([^ ]+)/) {
+        filename = gensub(/.*name=([^ ]+).*/, "\\1", 1);
+    }
+
+    filename = (filename ~ /_W$/ ? gensub(/_W$/, "_1W.bedGraph", 1, filename) :
+               (filename ~ /_C$/ ? gensub(/_C$/, "_2C.bedGraph", 1, filename) :
+               (filename ~ /_SV_stringent$/ ? gensub(/_SV_stringent$/, "_3SVstringent.bed", 1, filename) : filename)));
+
+    # Redirect to the specified output directory
+    filename = outdir "/" filename;
+}
+{
+    print $0 > filename;
+}'
+
+echo "Tracks have been saved to $output_dir/"
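
One portability caveat: gensub() is a GNU awk extension, so split_ucsc_file.sh assumes gawk is the awk on PATH. Where that is not guaranteed, the same splitting logic is straightforward in Python; a sketch mirroring the awk above (function name and argument handling are illustrative):

import gzip
import os
import re

def split_ucsc(input_gz, outdir):
    """Start a new output file at each 'track' line, renaming _W/_C/_SV_stringent
    tracks so they sort as 1W / 2C / 3SVstringent, as in the awk script above."""
    os.makedirs(outdir, exist_ok=True)
    suffixes = [("_W", "_1W.bedGraph"), ("_C", "_2C.bedGraph"),
                ("_SV_stringent", "_3SVstringent.bed")]
    out = None
    with gzip.open(input_gz, "rt") as fh:
        for line in fh:
            if line.startswith("track"):
                match = re.search(r'name="([^"]+)"', line) or re.search(r"name=(\S+)", line)
                name = match.group(1)
                for old, new in suffixes:
                    if name.endswith(old):
                        name = name[: -len(old)] + new
                        break
                if out is not None:
                    out.close()
                out = open(os.path.join(outdir, name), "w")
            if out is not None:
                out.write(line)
    if out is not None:
        out.close()
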
diff --git a/workflow/scripts/scNOVA_scripts/infer_diff_gene_expression.R b/workflow/scripts/scNOVA_scripts/infer_diff_gene_expression.R
index b9d45680..3443daa0 100644
--- a/workflow/scripts/scNOVA_scripts/infer_diff_gene_expression.R
+++ b/workflow/scripts/scNOVA_scripts/infer_diff_gene_expression.R
@@ -10,7 +10,7 @@ library(umap)
 library(pheatmap)
 library(gplots)

-filename = args[8]
+filename <- args[8]

 prefix <- strsplit(filename, "scNOVA_result_plots")[[1]][1]

@@ -244,8 +244,9 @@ plot(tsne_out$Y, pch = 16, xlab = "t-SNE1", ylab = "t-SNE2", cex = 1, col = data

 umap_out <- umap(ind.coord[, 1:10])
+# saveRDS(umap_out, file = "umap_out.rds")
 plot(umap_out$layout, pch = 16, xlab = "UMAP1", ylab = "UMAP2", cex = 1, col = data_lab_mat_sub)
-# write.table(umap_out$layout, file = "/Users/jeong/Documents/Strand_Seq/Deeptool/deeptool_ATAC/Active_X_haplo_analysis/LCL_GM20509/output_umap_500genes.txt", row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
+# write.table(umap_out$layout, file = "/scratch/tweber/DATA/MC_DATA/PAPER_RUNS/TMP/scNOVA/umap.txt", row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)


@@ -254,6 +255,7 @@ dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~cond
 # keep <- rowSums(counts(dds)) >= 10
 # dds <- dds[keep,]
 dds <- DESeq(dds)
+# save(dds, file = "/scratch/tweber/DATA/MC_DATA/PAPER_RUNS/TMP/scNOVA/dds.RData")
 normcount <- counts(dds, normalized = TRUE)

 res <- results(dds, contrast = c("condition", "clone2", "clone1"))
@@ -319,8 +321,10 @@ if (sum(input_matrix_sort_woMT$blacklist == 0 & res_sort_woMT$padj < 0.1 & is.na
   breaksList <- append(breaksList, 2)
   breaksList <- append(breaksList, -2, 0)
   mycol <- colorpanel(n = length(breaksList) - 1, low = "blue", mid = "white", high = "red")
-  res <- pheatmap(normlogt[class_label == "clone1" | class_label == "clone2", input_matrix_sort_woMT$blacklist == 0 & res_sort_woMT$padj < 0.1 & is.na(res_sort_woMT$padj) == 0], show_rownames = F, show_colnames = T, cluster_cols = T, cluster_rows = T, scale = "column", col = mycol, breaks = breaksList, clustering_distance_rows = "euclidean", cex = 0.8, annotation_row = row_annotation, annotation_colors = anno_colors, clustering_method = "ward.D")
+  heatmap_data <- normlogt[class_label == "clone1" | class_label == "clone2", input_matrix_sort_woMT$blacklist == 0 & res_sort_woMT$padj < 0.1 & is.na(res_sort_woMT$padj) == 0]
+  # save(heatmap_data, breaksList, row_annotation, anno_colors, file = "/scratch/tweber/DATA/MC_DATA/PAPER_RUNS/TMP/scNOVA/heatmap_data_and_breaks.RData")
+  res <- pheatmap(heatmap_data, show_rownames = F, show_colnames = T, cluster_cols = T, cluster_rows = T, scale = "column", col = mycol, breaks = breaksList, clustering_distance_rows = "euclidean", cex = 0.8, annotation_row = row_annotation, annotation_colors = anno_colors, clustering_method = "ward.D")

   ## Without clustering row
   normlogt_sort <- rbind(normlogt[class_label == "clone1", ], normlogt[class_label == "clone2", ])
diff --git a/workflow/scripts/utils/install_R_package.R b/workflow/scripts/utils/install_R_package.R
index 00d8386c..ebf2e346 100644
--- a/workflow/scripts/utils/install_R_package.R
+++ b/workflow/scripts/utils/install_R_package.R
@@ -1,19 +1,34 @@
-package <- snakemake@params[["selected_package"]]
-# package <- "workflow/data/ref_genomes/BSgenome.T2T.CHM13.V2_1.0.0.tar.gz"
-# print(grepl("BSgenome.T2T.CHM13.V2_1.0.0.tar.gz", package, fixed = TRUE, perl = FALSE))
+# package <- snakemake@params[["selected_package"]]
+args <- commandArgs(TRUE)
+package <- args[1]

+# Check if the package is already available
 is_package_available <- require(package, character.only = TRUE)

 if (!isTRUE(is_package_available)) {
+  # Ensure BiocManager is available since it will be needed regardless of the condition
   if (!require("BiocManager", quietly = TRUE)) {
     install.packages("BiocManager", repos = "http://cran.us.r-project.org")
   }
+
+  # Condition 1: Install custom tar.gz BSgenome package named BSgenome.T2T.CHM13.V2_1.0.0.tar.gz
   if (grepl("BSgenome.T2T.CHM13.V2_1.0.0.tar.gz", package, fixed = TRUE, perl = FALSE)) {
-    print("T2T")
     BiocManager::install("GenomeInfoDbData", update = FALSE)
     install.packages(package, repos = NULL, type = "source")
-  } else {
+
+  # Condition 2: Install standard BSgenome packages (hg38/hg19/mm10)
+  } else if (package %in% c("BSgenome.Hsapiens.UCSC.hg38", "BSgenome.Hsapiens.UCSC.hg19", "BSgenome.Mmusculus.UCSC.mm10")) {
     BiocManager::install(package, update = FALSE)
+
+  # Condition 3: Install a custom package using devtools
+  } else {
+    # Ensure devtools is installed
+    if (!require("devtools", quietly = TRUE)) {
+      install.packages("devtools", repos = "http://cran.us.r-project.org")
+    }
+    devtools::install(package, dependencies = TRUE, update = FALSE)
   }
-  quit(save = "no")
+
+  # Exit after installation, if desired
+  # quit(save = "no")
 }
diff --git a/workflow/scripts/utils/populated_counts_for_qc_plot.py b/workflow/scripts/utils/populated_counts_for_qc_plot.py
index 2914616e..c597d022 100644
--- a/workflow/scripts/utils/populated_counts_for_qc_plot.py
+++ b/workflow/scripts/utils/populated_counts_for_qc_plot.py
@@ -8,7 +8,7 @@
     sep="\t",
     names=["chrom", "start", "end", "bin_id"],
 )
-binbed["ID"] = binbed["chrom"] + "_" + binbed["start"].astype(str) + "_" + binbed["end"].astype(str)
+binbed["ID"] = binbed["chrom"].astype(str) + "_" + binbed["start"].astype(str) + "_" + binbed["end"].astype(str)

 # Turn chrom into categorical
 binbed["chrom"] = pd.Categorical(
@@ -21,14 +21,13 @@
 binbed = binbed.sort_values(by=["chrom", "start", "end"]).reset_index(drop=True)
 binbed["w"], binbed["c"], binbed["class"] = 0, 0, None

-
 # Read SV file
 # df = pd.read_csv("../../../../mosaicatcher-update/.tests/data_CHR17/RPE-BM510/counts/RPE-BM510.txt.raw.gz", sep="\t")
 # sep = "," if "/multistep_normalisation/" in snakemake.input.counts else "\t"
 sep = "\t"
 df = pd.read_csv(snakemake.input.counts, sep=sep, compression="gzip")

-df["ID"] = df["chrom"] + "_" + df["start"].astype(str) + "_" + df["end"].astype(str)
+df["ID"] = df["chrom"].astype(str) + "_" + df["start"].astype(str) + "_" + df["end"].astype(str)
 df["w"] = df["w"].round(0).astype(int)
 df["c"] = df["c"].round(0).astype(int)
 if sep == ",":
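
The two astype(str) additions are the substance of this hunk: they are a defensive cast guarding against non-string chrom dtypes, such as the categorical that binbed's chrom column becomes a few lines later, where plain string concatenation raises a TypeError. A minimal reproduction:

import pandas as pd

binbed = pd.DataFrame({"chrom": pd.Categorical(["chr1", "chr2"]), "start": [0, 200000]})
# binbed["chrom"] + "_" raises TypeError for a categorical column;
# casting to str first makes the concatenation safe.
ids = binbed["chrom"].astype(str) + "_" + binbed["start"].astype(str)
print(ids.tolist())  # ['chr1_0', 'chr2_200000']
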
@@ -39,7 +38,6 @@

 # Loop over cells
 for cell in df.cell.unique().tolist():
-    # Outer join to retrieve both real count values from specified chromosome and empty bins
     tmp_df = pd.concat(
         [