From 31ceff51e392c76ad9bd4897dbfd59028a36249e Mon Sep 17 00:00:00 2001
From: Alexander Predeus <ap41@sanger.ac.uk>
Date: Tue, 9 Aug 2022 13:36:33 +0100
Subject: [PATCH] Update to syntax & using STAR v2.7.10a

---
 README.md                    | 50 ++++++++++++-------
 scripts/bbduk.sh             |  6 +++
 scripts/solo_QC.sh           | 15 ++++--
 scripts/starsolo_10x_auto.sh | 93 +++++++++++++++++++-----------------
 scripts/starsolo_dropseq.sh  | 27 ++++++++---
 scripts/starsolo_indrops.sh  | 33 +++++++++----
 scripts/starsolo_ss2.sh      | 24 ++++++++--
 scripts/starsolo_strt.sh     | 30 ++++++++----
 8 files changed, 184 insertions(+), 94 deletions(-)
 create mode 100755 scripts/bbduk.sh

diff --git a/README.md b/README.md
index 33d74e3..faec6eb 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
-# Synchronized processing of bulk and scRNA-seq
+# Wrapper scripts for using `STARsolo` with various types of single cell RNA-seq
 
-These are the scripts used for CellGenIT for synchronized processing of scRNA-seq and bulk RNA-seq. Both use [STAR](https://github.com/alexdobin/STAR) aligner to align reads to the reference genome. 
+These are the scripts used for CellGenIT for uniform processing of scRNA-seq - both 10X and quite a few other types (see below for supported platforms). Additionally, bulk RNA-seq could be processed using a 10X reference in a matched way - this should improve scRNA-seq-to-bulk mapping. All listed methods use [STAR](https://github.com/alexdobin/STAR) aligner to align reads to the reference genome. 
 
 ## Software installation
 
 ### STAR and RSEM versions
 
-`STAR` of version 2.7.9a or above is recommended. The newest update includes the ability to correctly process multi-mapping reads, and adds many important options and bug fixes. 
+`STAR` of version 2.7.9a or above is recommended (2.7.10a is the latest and greatest, as of August'22). The newest update includes the ability to correctly process multi-mapping reads, and adds many important options and bug fixes. 
 
-In order to use settings that closely mimic those of `Cell Ranger` v4 or above (see explanations below, particularly `--clipAdapterType CellRanger4` option), `STAR` needs to be re-compiled from source with `make STAR CXXFLAGS_SIMD="-msse4.2"` (see [this issue](https://github.com/alexdobin/STAR/issues/1218) for more info). If you get the "Illegal instruction" error, that's what you need to do. 
+In order to use settings that closely mimic those of `Cell Ranger` v4 or above (see explanations below, particularly `--clipAdapterType CellRanger4` option), `STAR` needs to be re-compiled from source with `make STAR CXXFLAGS_SIMD="-msse4.2"` (see [this issue](https://github.com/alexdobin/STAR/issues/1218) for more info). If you get the _Illegal instruction_ error, that's what you need to do. 
 
 There's also Martin Prete's awesome `icpc`-compiled version of `STAR` that's being tested right now - stay tuned for the updates. 
 
@@ -38,7 +38,7 @@ All **CellGenIT** pre-made `STAR` references are located in `/nfs/cellgeni/STAR/
 
 ## Processing scRNA-seq with STARsolo
 
-### Reprodicing `Cell Ranger` v4 and above (but much faster)
+### 10X: reproducing `Cell Ranger` v4 and above (but much faster)
 
 Full scripts with the latest settings are available in `/scripts` (there are several scripts according to 10x chemistry version; e.g. `starsolo_3p_v3.sh` should be used for v3 of 3' 10x, while `starsolo_5p_v2.sh` should be used for v2 of 5'. The scripts contain *many* options that frequently change; some of which will be explained below. In general, commands are tuned in such way that the results with be very close to those of `Cell Ranger` v4 and above. 
 
@@ -52,18 +52,19 @@ Below are the explanations for some of the options (note that 5' experiments **a
 
 | 10X VERSION | BC | UMILEN | STR |
 |:-:|:-:|:-:|:-:|
-| 3' v1 | 737K-april-2014_rc.txt |10 | Forward |
-| 3' v2 | 737K-august-2016.txt |10 | Forward |
-| 3' v3, v3.1 | 3M-february-2018.txt |12 | Forward |
-| 5' v1.1, v2 | 737K-august-2016.txt |10 | Reverse |
+| 3' v1 | 737K-april-2014_rc.txt | 10 | Forward |
+| 3' v2 | 737K-august-2016.txt | 10 | Forward |
+| 3' v3, v3.1 | 3M-february-2018.txt | 12 | Forward |
+| 5' v1.1, v2 | 737K-august-2016.txt | 10 | Reverse |
 | 5' v3 | 737K-august-2016.txt | 12 | Reverse |
+| multiome | 737K-arc-v1.txt | 12 | Forward |
 
 </div>
 
   - `--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30` are options that define UMI collapsing, barcode collapsing, and read clipping algorithms that are closest to ones used by `Cell Ranger`; 
   - `--soloCellFilter EmptyDrops_CR` specifies the cell filtering algorithm used in [EmptyDrops](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html), which is the default algorithm in later versions of `Cell Ranger`; 
   - `--soloFeatures Gene GeneFull Velocyto` output conventional (exon-only) UMI counts, as well as exon+intron UMI counts (analog of `Cell Ranger` premrna option), as well as matrices preprocessed for `Velocyto`; 
-  - `--soloMultiMappers Unique EM` is to count multimappers; 
+  - `--soloMultiMappers Unique EM` is to count multimappers (on by default in v3.0+ of these scripts; does not influence the main output, but creates an additional matrix in `/raw` subdir of `Gene` and `GeneFull`); 
   - `--readFilesCommand zcat` is used if your input fastq files are gzipped;
   - options grouped as `$SORTEDBAM` should be used if you need a genomic bam file; otherwise, use `$NOBAM`.  
 
@@ -76,14 +77,14 @@ STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_
      --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
      --soloFeatures Gene GeneFull Velocyto \
-     --soloOutFileNames output/ genes.tsv barcodes.tsv matrix.mtx
+     --soloOutFileNames output/ genes.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
 ```
 
-### Using STARsolo for Smart-seq/Smart-seq2
+### Using STARsolo for SMART-seq/SMART-seq2
 
 For plate-based methods that don't use UMIs (such as [SMART-Seq and SMART-Seq2](https://teichlab.github.io/scg_lib_structs/methods_html/SMART-seq_family.html)), `STARsolo` can be used as well. Fastq files for these methods usually come as separate, paired-end files; all of these should be listed in a *manifest* file - plain text, tab-separated file containing three columns per line: 1) full path to R1; 2) full path to R2; 3) cell name or ID. 
 
-Example of a script used to process Smart-seq2 data can be found in `/scripts/starsolo_ss2.sh`. Actual `STAR` command being run:
+Example of a script used to process Smart-seq2 data can be found in `/scripts/starsolo_ss2.sh`. Key parameters that can be adjusted are `--outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3`; the higher they are, the less permissive the alignment is. Lower values can help you "rescue" a large proportion of reads with high adapter content (see below for adapter trimming). Actual `STAR` command being run:
 
 ```bash
 STAR --runThreadN $CPUS --genomeDir $REF --runDirPerm All_RWX --readFilesCommand zcat $SORTEDBAM \
@@ -93,24 +94,39 @@ STAR --runThreadN $CPUS --genomeDir $REF --runDirPerm All_RWX --readFilesCommand
      --soloFeatures Gene GeneFull --soloOutFileNames output/ genes.tsv barcodes.tsv matrix.mtx
 ```
 
-Sometimes, reads can benefit from trimming adapters, which can be turned on using `--clip3pAdapterSeq <3' adapter sequence>` option. Alternatively, `bbduk.sh` can be used to trim adapters from reads prior to the alignment and quantification.  
+Often, SMART-seq2 reads can benefit from trimming adapters, which can be turned on using `--clip3pAdapterSeq <3' adapter sequence>` option. Alternatively, `bbduk.sh` can be used to trim adapters from reads prior to the alignment and quantification.  
 
 ### Counting the multimapping reads
 
 Default approach used by `Cell Ranger` (and `STARsolo` scripts above) is to discard all reads that map to multiple genomic locations with equal mapping quality. This approach creates a bias in gene expression estimation. Pseudocount-based methods correctly quantify multimapping reads, but generate false counts due to pseudo-alignment errors. These issues are described in good detail [here](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1). 
 
-If you would like to process multimappers, add the following options: `--soloMultiMappers Uniform EM`. This will generate an extra matrix in the /raw output folders. There will be non-integer numbers in the matrix because of split reads. If the downstream processing requires integers, you can round with a tool of your liking (e.g. `awk`). 
+If you would like to process multimappers, add the following options: `--soloMultiMappers Uniform EM` (on by default in v3.0+ of these scripts). This will generate an extra matrix in the `/raw` output folders. There will be non-integer numbers in the matrix because of split reads. If the downstream processing requires integers, you can round with a tool of your liking (e.g. `awk`). 
+
+As of `STAR` v2.7.10a, multimapper counting still does not work for SMART-seq2 or bulk RNA-seq processing. 
+
+### Running STARsolo on other scRNA-seq platforms 
+
+STARsolo is very flexible and can be used with almost any scRNA-seq method, provided you know the library structure - i.e. where cell barcodes, UMIs, and biological parts of the read are located in the sequencing fragment or reads. A great source of information about scRNA-seq library structures is [this page](https://teichlab.github.io/scg_lib_structs/).
+
+Currently, our scripts directory provides dedicated scripts for
+  - Drop-seq;
+  - inDrops; 
+  - STRT-seq. 
+
+Please contact `CellGenIT` if you need to process an unusual dataset. 
 
 ## Quick evaluation of multiple STARsolo runs
 
-If you've used these scripts to process multiple 10x samples, you can get a quick look at the results by copying `solo_QC.sh` script from this repo to the directory with `STARsolo` output folders, and running
+If you've used these scripts to process multiple 10X samples, you can get a quick look at the results by copying `solo_QC.sh` script from this repo to the directory with `STARsolo` output folders, and running
 
 ```bash
-./solo_QC.sh <output_tag> | column -t 
+./solo_QC.sh | column -t 
 ```
 
 The **output_tag** argument could be any common part of the folder name - e.g. if samples are called SRR124444 .. SRR124534, you can use **SRR124** or **SRR**.
 
+The script is designed for 10X or other droplet-based methods; the output will make a lot less sense for SMART-seq2. 
+
 ## Processing bulk RNA-seq with STAR/RSEM
 
 `RSEM` reference files need to be prepared from genome fasta and GTF using the following command: 
diff --git a/scripts/bbduk.sh b/scripts/bbduk.sh
new file mode 100755
index 0000000..1de48cb
--- /dev/null
+++ b/scripts/bbduk.sh
@@ -0,0 +1,6 @@
+#!/bin/bash 
+## Trim paired-end reads with BBDuk: reads <TAG>_1.fastq.gz and <TAG>_2.fastq.gz, writes <TAG>.bbduk.R1.fastq, <TAG>.bbduk.R2.fastq and <TAG>.bbduk.log
+TAG=${1:?"Usage: ./bbduk.sh <sample_tag>"}  ## abort with a usage message instead of running on an empty tag
+ADAPTERS=/nfs/users/nfs_a/ap41/bbmap/resources/adapters.fa
+## stdout and stderr of bbduk both go to the log file; expansions are quoted so tags with spaces or glob characters are passed intact
+bbduk.sh in1="${TAG}_1.fastq.gz" in2="${TAG}_2.fastq.gz" out1="$TAG.bbduk.R1.fastq" out2="$TAG.bbduk.R2.fastq" ref="$ADAPTERS" trimpolya=10 ktrim=r k=23 mink=11 hdist=1 tpe tbo &> "$TAG.bbduk.log"
diff --git a/scripts/solo_QC.sh b/scripts/solo_QC.sh
index 1cdcb35..f81b8ba 100755
--- a/scripts/solo_QC.sh
+++ b/scripts/solo_QC.sh
@@ -1,6 +1,6 @@
 #!/bin/bash 
 
-echo -e "Sample\tRd_all\tRd_in_cells\tFrc_in_cells\tUMI_in_cells\tCells\tMed_nFeature\tGood_BC\tall_u+m\tall_u\texon_u+m\texon_u\tfull_u+m\tfull_u"
+echo -e "Sample\tRd_all\tRd_in_cells\tFrc_in_cells\tUMI_in_cells\tCells\tMed_nFeature\tGood_BC\tStrand\tall_u+m\tall_u\texon_u+m\texon_u\tfull_u+m\tfull_u"
 
 for i in *
 do
@@ -10,15 +10,20 @@ do
     B=`grep "Reads With Valid Barcodes," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
     G1=`grep "Reads Mapped to Genome: Unique+Multiple," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
     G2=`grep "Reads Mapped to Genome: Unique," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
-    T1=`grep "Reads Mapped to Gene: Unique+Multipe Gene," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
-    T2=`grep "Reads Mapped to Gene: Unique Gene," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
-    F1=`grep "Reads Mapped to GeneFull: Unique+Multipe GeneFull," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
+    E1=`grep "Reads Mapped to Gene: Unique+Multip.*e Gene," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
+    E2=`grep "Reads Mapped to Gene: Unique Gene," $i/output/Gene/Summary.csv | awk -F "," '{print $2}'`
+    F1=`grep "Reads Mapped to GeneFull: Unique+Multip.*e GeneFull," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
     F2=`grep "Reads Mapped to GeneFull: Unique GeneFull," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
     C=`grep "Estimated Number of Cells," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
     R2=`grep "Unique Reads in Cells Mapped to GeneFull," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
     CF=`echo $R1 | awk -v v=$R2 '{printf "%.3f\n",v/$1}'`
     R3=`grep "UMIs in Cells," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
     GC=`grep "Median GeneFull per Cell," $i/output/GeneFull/Summary.csv | awk -F "," '{print $2}'`
-    echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$G1\t$G2\t$T1\t$T2\t$F1\t$F2"
+    ST=`grep "^soloStrand" $i/Log.out | grep RE-DEFINED | awk '{print $2}'`
+    if [[ $ST == "" ]]
+    then
+      ST="Undef"
+    fi
+    echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$ST\t$G1\t$G2\t$E1\t$E2\t$F1\t$F2"
   fi
 done
diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
index bd9eb16..6c5d343 100755
--- a/scripts/starsolo_10x_auto.sh
+++ b/scripts/starsolo_10x_auto.sh
@@ -7,19 +7,27 @@
 TAG=$1
 if [[ $TAG == "" ]]
 then
-  >&2 echo "Usage: ./starsolo_auto.sh <sample_tag>"
+  >&2 echo "Usage: ./starsolo_10x_auto.sh <sample_tag>"
   >&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)"
   exit 1
 fi
 
-CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the appropriate reference 
-FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ### change to the directory with fastq files/folders
+CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
+WL=/nfs/cellgeni/STAR/whitelists                                       ## directory with all barcode whitelists
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ## directory with your fastq files - can be in subdirs, just make sure tag is unique and greppable (e.g. no Sample1 and Sample10). 
+
 ## choose one of the two otions, depending on whether you need a BAM file 
 #BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
 BAM="--outSAMtype None"
 
-###################### DONT CHANGE OPTIONS BELOW THIS LINE ###########################
+###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
+
+if [[ `which samtools` == "" || `which seqtk` == "" || `which STAR` == "" ]]
+then
+  echo "ERROR: Please make sure you have STAR (v2.7.9a or above), samtools, and seqtk installed and available in PATH!"
+  exit 1
+fi
 
 mkdir $TAG && cd $TAG
 
@@ -59,46 +67,41 @@ R1LEN=""
 R2LEN=""
 R1DIS=""
 
-## depending on whether the files are archived or not,  
+
+## randomly subsample 200k reads - let's hope there are at least this many (there should be):
+seqtk sample -s100 $R1F 200000 > test.R1.fastq &
+seqtk sample -s100 $R2F 200000 > test.R2.fastq &
+wait
+
+## see if the original fastq files are gzip-compressed: 
 if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]]
 then  
   GZIP="--readFilesCommand zcat"
-  NBC1=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
-  NBC2=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
-  NBC3=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
-  NBCA=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
-  R1LEN=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-  R2LEN=`zcat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-  R1DIS=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
-  zcat $R1F | head -n 100000 > test.R1.fastq
-  zcat $R2F | head -n 100000 > test.R2.fastq 
-else 
-  NBC1=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
-  NBC2=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
-  NBC3=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
-  NBCA=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
-  R1LEN=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-  R2LEN=`cat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-  R1DIS=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
-  cat $R1F | head -n 100000 > test.R1.fastq
-  cat $R2F | head -n 100000 > test.R2.fastq 
 fi
 
+NBC1=`cat test.R1.fastq | awk 'NR%4==2' | grep -F -f $WL/737K-april-2014_rc.txt | wc -l`
+NBC2=`cat test.R1.fastq | awk 'NR%4==2' | grep -F -f $WL/737K-august-2016.txt | wc -l`
+NBC3=`cat test.R1.fastq | awk 'NR%4==2' | grep -F -f $WL/3M-february-2018.txt | wc -l`
+NBCA=`cat test.R1.fastq | awk 'NR%4==2' | grep -F -f $WL/737K-arc-v1.txt | wc -l`
+R1LEN=`cat test.R1.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+R2LEN=`cat test.R2.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+R1DIS=`cat test.R1.fastq | awk 'NR%4==2' | awk '{print length($0)}' | sort | uniq -c | wc -l`
+
 ## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt):
-if (( $NBC2 > 5000 )) 
+if (( $NBC2 > 100000 )) 
 then 
-  BC=/nfs/cellgeni/STAR/whitelists/737K-august-2016.txt
-elif (( $NBC3 > 5000 ))
+  BC=$WL/737K-august-2016.txt
+elif (( $NBC3 > 100000 ))
 then
-  BC=/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt
-elif (( $NBCA > 5000 ))
+  BC=$WL/3M-february-2018.txt
+elif (( $NBCA > 100000 ))
 then
-  BC=/nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt
-elif (( $NBC1 > 5000 )) 
+  BC=$WL/737K-arc-v1.txt
+elif (( $NBC1 > 100000 )) 
 then
-  BC=/nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt
+  BC=$WL/737K-april-2014_rc.txt
 else 
-  >&2 echo "ERROR: No whitelist has matched first 10000 barcodes!"
+  >&2 echo "ERROR: No whitelist has matched a random selection of 200,000 barcodes!"
   exit 1
 fi 
 
@@ -114,9 +117,9 @@ elif (( $R1LEN < 24 ))
 then
   >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files."
   exit 1
-elif (( $R2LEN < 50 )) 
+elif (( $R2LEN < 40 )) 
 then
-  >&2 echo "ERROR: Read 2 (biological read) is less than 50 bp in length. Please check the fastq files."
+  >&2 echo "ERROR: Read 2 (biological read) is less than 40 bp in length. Please check the fastq files."
   exit 1
 fi
 
@@ -126,7 +129,7 @@ then
   PAIRED=True
   UMILEN=10
   CBLEN=16
-elif (( $NBC1 > 5000 )) 
+elif (( $NBC1 > 100000 )) 
 then
   CBLEN=14
   UMILEN=$((R1LEN-14))
@@ -144,10 +147,12 @@ STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fas
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
      --soloFeatures Gene --soloOutFileNames test_strand/ features.tsv barcodes.tsv matrix.mtx &> /dev/null 
-rm test.R1.fastq test.R2.fastq
 
-GENEPCT=`grep "Reads Mapped to Gene: Unique Gene" test_strand/Gene/Summary.csv | awk -F "," '{printf "%d\n",$2*100}'`
-if (( $GENEPCT < 10 )) 
+## the following is needed in case of bad samples: when a low fraction of reads come from mRNA, experiment will look falsely reverse-stranded
+UNIQFRQ=`grep "Reads Mapped to Genome: Unique," test_strand/Gene/Summary.csv | awk -F "," '{print $2}'`
+GENEPCT=`grep "Reads Mapped to Gene: Unique Gene" test_strand/Gene/Summary.csv | awk -F "," -v v=$UNIQFRQ '{printf "%d\n",$2*100/v}'`
+
+if (( $GENEPCT < 20 )) 
 then
   STRAND=Reverse
 fi
@@ -156,7 +161,7 @@ fi
 if [[ $STRAND == "Forward" && $PAIRED == "True" ]]
 then
   PAIRED=False
-  if [[ $BC == "/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt" ]] 
+  if [[ $BC == "$WL/3M-february-2018.txt" ]] 
   then
     UMILEN=12
   fi
@@ -166,7 +171,7 @@ echo "Done setting up the STARsolo run; here are final processing options:"
 echo "============================================================================="
 echo "Sample: $TAG"
 echo "Paired-end mode: $PAIRED"
-echo "Strand (Forward = 3', Reverse = 5'): $STRAND"
+echo "Strand (Forward = 3', Reverse = 5'): $STRAND, %reads same strand as gene: $GENEPCT"
 echo "CB whitelist: $BC"
 echo "CB length: $CBLEN"
 echo "UMI length: $UMILEN"
@@ -183,13 +188,13 @@ then
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
-     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
 else 
   STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
-     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
 fi
 
 ## index the BAM file
diff --git a/scripts/starsolo_dropseq.sh b/scripts/starsolo_dropseq.sh
index 1bfe503..6f27021 100755
--- a/scripts/starsolo_dropseq.sh
+++ b/scripts/starsolo_dropseq.sh
@@ -7,19 +7,26 @@
 TAG=$1
 if [[ $TAG == "" ]]
 then
-  >&2 echo "Usage: ./starsolo_auto.sh <sample_tag>"
+  >&2 echo "Usage: ./starsolo_dropseq.sh <sample_tag>"
   >&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)"
   exit 1
 fi
 
-CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the appropriate reference 
-FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1258/fastq_HCA_Dropseq  ### change to the directory with fastq files/folders
+CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ## directory with your fastq files - can be in subdirs, just make sure tag is unique and greppable (e.g. no Sample1 and Sample10). 
+
 ## choose one of the two otions, depending on whether you need a BAM file 
-BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
-#BAM="--outSAMtype None"
+#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
+BAM="--outSAMtype None"
+
+###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
 
-###################### DONT CHANGE OPTIONS BELOW THIS LINE ###########################
+if [[ `which samtools` == "" || `which STAR` == "" ]]  ## only STAR and samtools are tested for here, so the message below names only those two
+then
+  >&2 echo "ERROR: Please make sure you have STAR (v2.7.9a or above) and samtools installed and available in PATH!"
+  exit 1
+fi
 
 mkdir $TAG && cd $TAG
 
@@ -53,6 +60,12 @@ STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_
      --soloType CB_UMI_Simple --soloCBwhitelist None --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13 --soloUMIlen 8 --soloBarcodeReadLength 0 \
      --soloFeatures Gene GeneFull --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
 
+## index the BAM file
+if [[ -s Aligned.sortedByCoord.out.bam ]]
+then
+  samtools index -@16 Aligned.sortedByCoord.out.bam
+fi
+
 ## finally, let's gzip all outputs
 cd output
 for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered
diff --git a/scripts/starsolo_indrops.sh b/scripts/starsolo_indrops.sh
index 4f689b1..ccf8da6 100755
--- a/scripts/starsolo_indrops.sh
+++ b/scripts/starsolo_indrops.sh
@@ -7,24 +7,33 @@
 TAG=$1
 if [[ $TAG == "" ]]
 then
-  >&2 echo "Usage: ./starsolo_auto.sh <sample_tag>"
+  >&2 echo "Usage: ./starsolo_indrops.sh <sample_tag>"
   >&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)"
   exit 1
 fi
 
-CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the appropriate reference 
-FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1258/inDrops_fastqs  ### change to the directory with fastq files/folders
+CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
+WL=/nfs/cellgeni/STAR/whitelists                                       ## directory with all barcode whitelists
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ## directory with your fastq files - can be in subdirs, just make sure tag is unique and greppable (e.g. no Sample1 and Sample10). 
+
 ## choose one of the two otions, depending on whether you need a BAM file 
-BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
-#BAM="--outSAMtype None"
+#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
+BAM="--outSAMtype None"
+
+###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
+
+if [[ `which samtools` == "" || `which STAR` == "" ]]
+then
+  echo "ERROR: Please make sure you have STAR (v2.7.9a or above) and samtools are installed and available in PATH!"
+  exit 1
+fi
 
-###################### DONT CHANGE OPTIONS BELOW THIS LINE ###########################
 
 mkdir $TAG && cd $TAG
 
-BC1=/nfs/cellgeni/STAR/whitelists/inDrops_Ambrose2_bc1.txt
-BC2=/nfs/cellgeni/STAR/whitelists/inDrops_Ambrose2_bc2.txt
+BC1=$WL/inDrops_Ambrose2_bc1.txt
+BC2=$WL/inDrops_Ambrose2_bc2.txt
 
 R1=""
 R2=""
@@ -66,5 +75,11 @@ do
   cd ../../
 done
 
+## index the BAM file
+if [[ -s Aligned.sortedByCoord.out.bam ]]
+then
+  samtools index -@16 Aligned.sortedByCoord.out.bam
+fi
+
 wait
 echo "ALL DONE!"
diff --git a/scripts/starsolo_ss2.sh b/scripts/starsolo_ss2.sh
index a10af7f..0526f99 100755
--- a/scripts/starsolo_ss2.sh
+++ b/scripts/starsolo_ss2.sh
@@ -13,12 +13,22 @@ then
   exit 1
 fi
 
-CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the proper reference 
-GZIP="--readFilesCommand zcat"
-#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 60000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM GX GN"
+CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ## directory with your fastq files - can be in subdirs, just make sure tag is unique and greppable (e.g. no Sample1 and Sample10). 
+
+## choose one of the two options, depending on whether you need a BAM file 
+#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
 BAM="--outSAMtype None"
 
+###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
+
+if [[ `which samtools` == "" || `which STAR` == "" ]]
+then
+  echo "ERROR: Please make sure you have STAR (v2.7.9a or above) and samtools are installed and available in PATH!"
+  exit 1
+fi
+
 mkdir $TAG.solo.SS2 && cd $TAG.solo.SS2
 
 ## outFilter* options can be adjusted according to the mapping rate and mapped length
@@ -27,3 +37,9 @@ STAR --runThreadN $CPUS --genomeDir $REF --runDirPerm All_RWX $GZIP $BAM \
      --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3 \
      --soloType SmartSeq --readFilesManifest ../$TAG.manifest.tsv --soloUMIdedup Exact --soloStrand Unstranded \
      --soloFeatures Gene GeneFull --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+
+## index the BAM file
+if [[ -s Aligned.sortedByCoord.out.bam ]]
+then
+  samtools index -@16 Aligned.sortedByCoord.out.bam
+fi
diff --git a/scripts/starsolo_strt.sh b/scripts/starsolo_strt.sh
index 2ffbd7e..7865cca 100755
--- a/scripts/starsolo_strt.sh
+++ b/scripts/starsolo_strt.sh
@@ -13,16 +13,24 @@ then
   exit 1
 fi
 
-CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the appropriate reference 
-FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1211/GSE142653/fastqs  ### change to the directory with fastq files/folders
-BC=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1211/GSE142653/96_barcodes.list
-UMILEN=8  ## need to change barcode length too 
-STR=Forward  ## 3' 10x
+CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
+WL=/nfs/cellgeni/STAR/whitelists                                       ## directory with all barcode whitelists
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-XXX/fastqs  ## directory with your fastq files - can be in subdirs, just make sure tag is unique and greppable (e.g. no Sample1 and Sample10). 
 
 ## choose one of the two otions, depending on whether you need a BAM file 
-SORTEDBAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 60000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
-NOBAM="--outSAMtype None"
+#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
+BAM="--outSAMtype None"
+
+###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
+
+if [[ `which samtools` == "" || `which STAR` == "" ]]
+then
+  echo "ERROR: Please make sure you have STAR (v2.7.9a or above) and samtools are installed and available in PATH!"
+  exit 1
+fi
+
+BC=$WL/96_barcodes.list
 
 mkdir $TAG && cd $TAG
 ## for multiple fastq files; change grep options according to your fastq file format 
@@ -51,6 +59,12 @@ do
   cd ../../
 done
 
+## index the BAM file
+if [[ -s Aligned.sortedByCoord.out.bam ]]
+then
+  samtools index -@16 Aligned.sortedByCoord.out.bam
+fi
+
 wait
 echo "ALL DONE!"