From d5fdf666b1d354d70cbc6ce707b03dcf8a2fc8f2 Mon Sep 17 00:00:00 2001 From: clegendre Date: Thu, 26 Jan 2023 16:13:54 -0700 Subject: [PATCH] removing indels first from vcf to avoid issues with phASER then process VCF as before --- ....phasing_consecutives_variants_as_blocs.sh | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/prep_vcfs_somatic/strelka2/strelka2.phasing_consecutives_variants_as_blocs.sh b/prep_vcfs_somatic/strelka2/strelka2.phasing_consecutives_variants_as_blocs.sh index bc86c02..81bfd30 100755 --- a/prep_vcfs_somatic/strelka2/strelka2.phasing_consecutives_variants_as_blocs.sh +++ b/prep_vcfs_somatic/strelka2/strelka2.phasing_consecutives_variants_as_blocs.sh @@ -45,7 +45,7 @@ echo -e "USAGE: $0 \$VCF.gz \$TBAM \$SNAME_T \$CPUS \n1) compressed_VCF\n2) BAM } ## checkings section -if [[ $# -ne 4 ]] ; then usage ; exit -1 ; fi +if [[ $# -ne 4 ]] ; then usage ; exit -1 ; fi for F in ${SCRIPT_GET_CONSPOS} ${VCF} ${TBAM} ; do checkFile ${F} ; done for EXE in samtools bcftools phaser.py ; do check_exe_in_path ${EXE} ; done if [[ ${CPUS} -ge ${MAX_CPUS_IN_CPUINFO} ]] ; then CPUS=$((${MAX_CPUS_IN_CPUINFO}-1)) ; fi @@ -86,6 +86,22 @@ VCF=${VCF_ORIGINAL_INPUT} if [[ 1 == 1 ]] ;then + echo -e "Keeping out the Indels as phASER exclude them from phasing anyway" 1>&2 + ## We had to do this due to encountering an edge case mentioned in issue #27 in github + bcftools filter -O z -i 'TYPE="indel"' -o ${VCF/vcf.gz/indels.vcf.gz} ${VCF} + check_ev $? "bcftools filter out indels" + bcftools index --tbi ${VCF/vcf.gz/indels.vcf.gz} + check_ev $? "bcftools index indels calls" + VCF_INDELS_ONLY=${VCF/vcf.gz/indels.vcf.gz} + + ## now Subsetting the VCF_SBCP to continue without the indels + bcftools filter -O z -e 'TYPE="indel"' -o ${VCF/vcf.gz/noindels.vcf.gz} ${VCF} + check_ev $? "bcftools filter out indels" + bcftools index --tbi ${VCF/vcf.gz/noindels.vcf.gz} + check_ev $? "bcftools index indels calls" + cp ${VCF} ${VCF}.original_input.vcf + cp ${VCF/vcf.gz/noindels.vcf.gz} ${VCF} + echo -e "get consecutive positions ... as tabulated text file for bcftools ..." python3 ${SCRIPT_GET_CONSPOS} ${VCF} check_ev $? "$(basename ${SCRIPT_GET_CONSPOS})" @@ -269,7 +285,7 @@ VCF_IN_UNPHASED=${VCF_ORIGINAL_INPUT/vcf.gz/TempNoConsPos.vcf.gz} VCF_IN_PHASED=${VCF_OUT} ## WE OUTPUT An UNcompressed VCF; VCF_OUT=${VCF_ORIGINAL_INPUT/.vcf.gz/.blocs.vcf} -mycmd="bcftools concat -a -O v ${VCF_IN_UNPHASED} ${VCF_IN_PHASED} | bcftools sort -O v -T ${PWD} --max-mem 2G - > ${VCF_OUT}" +mycmd="bcftools concat -a -O v ${VCF_INDELS_ONLY} ${VCF_IN_UNPHASED} ${VCF_IN_PHASED} | bcftools sort -O v -T ${PWD} --max-mem 2G - > ${VCF_OUT}" echo ${mycmd} eval ${mycmd} check_ev $? "bcftools concat ${VCF_OUT}" @@ -282,4 +298,4 @@ touch Completed_Recomposition_Strelka2_for_VCF_$(basename ${VCF_ORIGINAL_INPUT}) echo "${VCF_OUT}" 2>&1 -exit 0 +exit 0 \ No newline at end of file