ripples filtration pipeline scripts
mrkylesmith committed Jun 21, 2022
1 parent 575e546 commit 0f8231a
Showing 34 changed files with 4,159 additions and 4 deletions.
27 changes: 23 additions & 4 deletions CMakeLists.txt
@@ -53,6 +53,7 @@ AUX_SOURCE_DIRECTORY(src/matOptimize/apply_move patch_tree)
file(GLOB MATUTIL_SRCS "src/matUtils/*.cpp" "src/matUtils/*.hpp")
file(GLOB RIPPLES_SRCS "src/ripples/*.cpp" "src/ripples/*.hpp")
file(GLOB RIPPLES_FAST_SRCS "src/ripples/ripples_fast/*.cpp" "src/ripples/ripples_fast/*.hpp")
file(GLOB RIPPLES_UTILS_SRCS "src/ripples/util/*.cpp" "src/ripples/util/*.hpp")

set_source_files_properties(src/mutation_annotated_tree.cpp PROPERTIES COMPILE_FLAGS -O3)
#set_source_files_properties(src/usher_mapper.cpp PROPERTIES COMPILE_FLAGS -O3)
@@ -164,9 +165,6 @@ if(DEFINED Protobuf_PATH)
TARGET compareVCF
PROTOS parsimony.proto)




protobuf_generate(
LANGUAGE cpp
TARGET usher
@@ -255,6 +253,24 @@ else()
${PROTO_HDRS}
)

add_executable(ripplesUtils
src/mutation_annotated_tree.cpp
src/usher_mapper.cpp
src/matUtils/describe.cpp
${RIPPLES_UTILS_SRCS}
${PROTO_SRCS}
${PROTO_HDRS}
)

add_executable(ripplesInit
src/mutation_annotated_tree.cpp
src/usher_mapper.cpp
src/ripples/init/init_pipeline.cpp
src/ripples/init/main.cpp
${PROTO_SRCS}
${PROTO_HDRS}
)

add_executable(compareVCF
src/mutation_annotated_tree.cpp
src/compareVCF.cpp
@@ -362,6 +378,9 @@ target_include_directories(usher PUBLIC "${PROJECT_BINARY_DIR}")
TARGET_COMPILE_OPTIONS(matUtils PRIVATE -DTBB_SUPPRESS_DEPRECATED_MESSAGES)
TARGET_LINK_LIBRARIES(matUtils PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES}) # OpenMP::OpenMP_CXX)

TARGET_LINK_LIBRARIES(ripplesUtils PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES}) # OpenMP::OpenMP_CXX)
TARGET_LINK_LIBRARIES(ripplesInit PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES}) # OpenMP::OpenMP_CXX)

TARGET_COMPILE_OPTIONS(ripples PRIVATE -DTBB_SUPPRESS_DEPRECATED_MESSAGES)
TARGET_LINK_LIBRARIES(ripples PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES}) # OpenMP::OpenMP_CXX)
TARGET_LINK_LIBRARIES(ripples-fast PRIVATE stdc++ ${Boost_LIBRARIES} ${TBB_IMPORTED_TARGETS} ${Protobuf_LIBRARIES}) # OpenMP::OpenMP_CXX)
@@ -372,4 +391,4 @@ if(USHER_SERVER)
install(TARGETS usher matUtils matOptimize ripples usher_server DESTINATION bin)
else()
install(TARGETS usher matUtils matOptimize ripples DESTINATION bin)
endif()
endif()
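
With this diff applied, the two new targets build like any other CMake target. The sketch below is illustrative only: the plain `cmake ..` invocation assumes TBB, Boost, and Protobuf are already discoverable on your system (otherwise `install/installUbuntu.sh` sets up dependencies and builds everything for you, and extra `-D` hint flags may be required).

```
# Illustrative only: build just the new ripplesUtils and ripplesInit targets.
# Dependency hint flags (e.g. for TBB) may be needed on your system.
mkdir -p build && cd build
cmake ..
make -j4 ripplesUtils ripplesInit
```
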
1 change: 1 addition & 0 deletions install/installUbuntu.sh
@@ -18,6 +18,7 @@ cd isa-l-2.30.0
make -j$(nproc)
sudo -E make install
cd ..

#download and install TBB
wget https://github.com/oneapi-src/oneTBB/archive/2019_U9.tar.gz
tar -xvzf 2019_U9.tar.gz
65 changes: 65 additions & 0 deletions scripts/recombination/Dockerfile
@@ -0,0 +1,65 @@
FROM ubuntu:20.04
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
ENV DEBIAN_FRONTEND=noninteractive
USER root

RUN apt-get update && apt-get install -yq --no-install-recommends \
build-essential git wget vim curl rsync python3 python3-pip cmake ninja-build jq \
bzip2 gnupg2 squashfs-tools openmpi-bin \
libboost-all-dev \
libprotoc-dev protobuf-compiler \
libtbb-dev \
mpich libmpich-dev automake libtool autoconf make nasm \
ca-certificates \
apt-transport-https gnupg \
lsb-core \
sudo

# gcloud
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt-get update -y && \
apt-get install -y google-cloud-sdk

# gcsfuse
ENV GCSFUSE_REPO=gcsfuse-focal
RUN echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - && \
apt-get update -y && \
apt-get install -yq gcsfuse

# mafft build
RUN git clone https://github.com/GSLBiotech/mafft && \
cd mafft/core && \
make -j$(grep -c ^processor /proc/cpuinfo 2>/dev/null || sysctl -n hw.ncpu) && \
make install

# Install conda
RUN curl -OL https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh
RUN bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b

ENV PATH="/root/miniconda3/bin:${PATH}"

RUN conda install mamba -n base -c conda-forge
RUN mamba install -y -c conda-forge -c bioconda snakemake-minimal numpy pyyaml
RUN pip3 install chronumental

# Install faSomeRecords
RUN rsync -aP rsync://hgdownload.soe.ucsc.edu/genome/admin/exe/linux.x86_64/faSomeRecords /usr/bin
RUN chmod +x /usr/bin/faSomeRecords

WORKDIR /HOME

RUN git clone https://github.com/yatisht/usher.git
WORKDIR usher

RUN ./install/installUbuntu.sh
RUN apt-get install -y parallel

# Install 3seq
RUN cd scripts/recombination/filtering && \
./3seq_install.sh

# Set the path
ENV PATH="/HOME/usher/build:/HOME/kentsource:${PATH}"
WORKDIR scripts/recombination
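
If you would rather build this image yourself than pull the published image referenced in the README below, a minimal sketch (run from the root of the usher checkout; the local tag name is arbitrary):

```
# Build the image from this directory's Dockerfile and open an interactive shell.
docker build -t ripples_pipeline:local scripts/recombination/
docker run -it ripples_pipeline:local
```
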
146 changes: 146 additions & 0 deletions scripts/recombination/README.md
@@ -0,0 +1,146 @@
![Ripples](images/ripples_logo.png)

# **Recombination Inference using Phylogenetic PLacEmentS** (RIPPLES)

RIPPLES is a program to detect recombination events in large mutation annotated trees (MAT). This repo contains a workflow for running RIPPLES on Google Cloud Platform.

Please also refer to this tutorial for examples using RIPPLES: [RIPPLES Tutorial](https://usher-wiki.readthedocs.io/en/latest/ripples.html)

# RIPPLES on Google Cloud Platform

<br>

## Set up your Google Cloud Platform Account
___

1. **Set up Cloud Console:**
- If needed, please follow these instructions to open Cloud Console, create a project storage bucket (`bucket_id`) and project ID (`project_id`):
[Installation and Setup](https://cloud.google.com/deployment-manager/docs/step-by-step-guide/installation-and-setup)

<br>

2. **Add a service account**:
  - Click the Navigation Menu sidebar on the GCP Console and go to `IAM & Admin` -> `Service Accounts`. Click `+Create Service Account`.

<br>

3. **Create and Download Keys (JSON)**
  - Once you have created a service account, you need to add keys to this service account.
  Click the Navigation Menu sidebar on the web console, go to `IAM & Admin` -> `Service Accounts`, and click on the active service account you just created in the previous step.

  - Click the `Keys` tab, then `ADD KEY` and `Create new key`. Select the `JSON` key type. A new `<keys>.json` file will automatically be downloaded by your browser.

  - Move this downloaded `<keys>.json` file to the following location (or edit the command below for the location of your choice):

```
~/.config/gcloud/<keys>.json
```

  - Then run the following command in your terminal to set an environment variable pointing to the location where you just placed your downloaded `<keys>.json` file.

```
KEY=~/.config/gcloud/<keys>.json
```
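
Optionally, as a quick sanity check that `KEY` points at a valid service-account key, you can print the key's `client_email` field (a standard field of GCP service-account JSON keys); this assumes `jq` is installed on your machine:

```
test -f "$KEY" && jq -r .client_email "$KEY"
```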

<br>


## Run RIPPLES (Docker workflow)
___

Pull and run the public RIPPLES Docker image with the following command, replacing the name of your `<keys>.json` file in the command below:
```
docker run -it -v ${KEY}:/tmp/keys/<keys>.json:ro mrkylesmith/ripples_pipeline:latest
```

This will place you into an interactive (`-it` flag) Docker container shell from which you can launch RIPPLES jobs on GCP. The Docker image is configured with all the installs and dependencies needed to run the RIPPLES pipeline.
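
The `run.py` launcher (see below) reads the key file path from `ripples.yaml`, but if you want to verify the mounted credentials manually from inside the container, something along these lines should work; replace the placeholders with your own key file name and bucket ID:

```
gcloud auth activate-service-account --key-file=/tmp/keys/<keys>.json
gsutil ls gs://<bucket_id>
```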

<br>


## Set up Access to your GCP Account and configure the RIPPLES job
___

There is a blank template configuration file located in the `template/` directory. Copy it into the current directory and edit the file, following the instructions below to set your configurations.
```
cp template/ripples.yaml .
```

### Grant access to GCP account
- Add your bucket ID, project ID, and the name of your downloaded `<keys>.json` file to the `ripples.yaml` configuration file like this:
```
bucket_id: <your_bucket_id>
project_id: <your_project_id>
key_file: /tmp/keys/<your_key_file.json>
```

<br>

## Configure RIPPLES parameters and GCP instance type
___
This RIPPLES workflow takes two main inputs, described in more detail below:
- a MAT protobuf
- a raw sequence (`.fa`) file that you have placed in your Google Cloud Storage Bucket (`bucket_id`)

Set the configurations for the RIPPLES job you want to run in `ripples.yaml`, shown below:
```
# Ripples parameters config
version: ripples-fast
mat: <mat.pb>
date: <2021-07-02>
raw_sequences: <raw_sequences>.fa
reference: <reference.fa>
results: <results>
# GCP machine and Storage Bucket config
instances: 4
boot_disk_size: 30
machine_type: e2-standard-16
logging: <example.log>
```
### RIPPLES Options:
- `version`: Set to `ripples`, or to `ripples-fast` to run the fast version of the RIPPLES program.

- `mat`: The Mutation Annotated Tree (MAT) protobuf that you want to search for recombination events.

- `date`: Set the date for the given input tree.

- `raw_sequences`: A raw sequence file that contains all the raw sequences for a set of descendant nodes in the tree.

- `reference`: The SARS-CoV-2 reference genome.

- `results`: The output directory where RIPPLES will write results, both in your GCP Storage Bucket and locally within the interactive Docker shell where the RIPPLES job is launched.
NOTE: Create an empty `results` directory in your GCP Storage Bucket before you run this pipeline.

- `num_descendants`: [OPTIONAL] Minimum number of leaves a node should have to be considered for recombination (default = 2).

### GCP Instance Type Options:
- `instances`: Number of GCP instances that RIPPLES will be parallelized across. Results will be automatically aggregated into the `results` directory on your GCP Storage Bucket and locally when all RIPPLES jobs are complete.

- `boot_disk_size`: Instance boot disk size in GB. **Leave as 30**.

- `machine_type`: Select the GCP instance type you would like to launch.

- `logging`: Name of the log file for this particular RIPPLES job, which will be written to your GCP Storage Bucket under `bucket_id/logging/<logging>`.

<br>

**Note:** All of the configurations above should be updated as needed for each separate RIPPLES job that is run; an example of a completed `ripples.yaml` is shown below.
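
For illustration, a completed `ripples.yaml` might look like the following; every value here is hypothetical and should be replaced with your own bucket, project, key file, tree, and sequence names:

```
bucket_id: my-ripples-bucket
project_id: my-gcp-project
key_file: /tmp/keys/my-service-account.json

# Ripples parameters config
version: ripples-fast
mat: public-2021-07-02.all.masked.pb
date: 2021-07-02
raw_sequences: raw_sequences.fa
reference: reference.fa
results: results
num_descendants: 2

# GCP machine and Storage Bucket config
instances: 4
boot_disk_size: 30
machine_type: e2-standard-16
logging: ripples_2021-07-02.log
```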

<br>

## Running your RIPPLES job
___

Execute the following command to launch your RIPPLES job according to the configurations set in `ripples.yaml`.
```
python3 run.py
```
Once all jobs are complete, the detected recombinants will be placed in the specified `<results/>` directory.
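
The log file for the job is written to your GCP Storage Bucket under the `logging` name set in `ripples.yaml` (see above); one way to view it once it appears:

```
# Print the job log from your GCP Storage Bucket (replace the placeholders).
gsutil cat gs://<bucket_id>/logging/<logging>
```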

## RIPPLES Output
The following two files will be output:
- `recombinants_<date>.txt`: This is the final output file, containing all detected recombinants found in the input tree, one per line.

- `unfiltered_recombinants<date>.txt`: File containing the unfiltered recombinants detected by the RIPPLES search, before being run through the filtration/QC pipeline.
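
Results are also copied locally into the Docker shell, so pulling them down from the bucket is mainly useful from another machine. A minimal example (paths are placeholders matching the `results` and `date` values you configured):

```
# Copy the results directory from the bucket and count detected recombinants
# (one per line in the final output file).
gsutil cp -r gs://<bucket_id>/<results>/ .
wc -l <results>/recombinants_<date>.txt
```
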
9 changes: 9 additions & 0 deletions scripts/recombination/filtering/3seq_install.sh
@@ -0,0 +1,9 @@
#!/bin/bash -ex
#
# Install 3seq
startDir=$PWD
git clone https://gitlab.com/mrkylesmith/3seq.git
cd 3seq
make
./3seq -g my3seqTable700 700
cd $startDir