diff --git a/CHANGES b/CHANGES index c070f365..d98990c4 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,6 @@ -STARlong: fixed --outFilterIntronMotifs and --outSAMstrandField options. +STAR 2.4.2a 2015/06/19 Implemented --quantMode GeneCounts option for counting number of reads per gene, similar to htseq-count. +STARlong: fixed --outFilterIntronMotifs and --outSAMstrandField options. Yet another fix for --sjdbOverhang logic. Error message when shared memory and on the fly junction insertion are used together. Fixed a bug causing unnecessary 1 base soft-clipping in rare cases with sparse suffix array. diff --git a/RELEASEnotes b/RELEASEnotes index 9b73773b..260590d8 100644 --- a/RELEASEnotes +++ b/RELEASEnotes @@ -1,3 +1,26 @@ +STAR 2.4.2a 2015/06/19 + +New features: + +Counting reads per gene while mapping with --quantMode GeneCounts option. +A read is counted if it overlaps (1nt or more) one and only one gene. Both ends of the paired-end read are checked for overlaps. +The counts coincide with those produced by htseq-count with default parameters. + +Requires annotations (GTF or GFF with --sjdbGTFfile option) used at the genome generation step, or at the mapping step. + +Outputs read counts per gene into ReadsPerGene.out.tab file with 4 columns which correspond to different strandedness options: +column 1: gene ID +column 2: counts for unstranded RNA-seq +column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes) +column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse) +Select the output according to the strandedness of your data. +Note, that if you have stranded data and choose one of the columns 3 or 4, the other column (4 or 3) will give you the count of antisense reads. + +With --quantMode TranscriptomeSAM GeneCounts, you get both the Aligned.toTranscriptome.out.bam and ReadsPerGene.out.tab outputs.
+ + + +################################################################################################################################################################ STAR 2.4.1a 2015/04/17 New features: diff --git a/bin/MacOSX_x86_64/STAR b/bin/MacOSX_x86_64/STAR index 8cb3a62e..8d0619f3 100755 Binary files a/bin/MacOSX_x86_64/STAR and b/bin/MacOSX_x86_64/STAR differ diff --git a/bin/MacOSX_x86_64/STARlong b/bin/MacOSX_x86_64/STARlong new file mode 100755 index 00000000..4b3ca7e6 Binary files /dev/null and b/bin/MacOSX_x86_64/STARlong differ diff --git a/doc/STARmanual.pdf b/doc/STARmanual.pdf index 90bf19fa..08cdedc7 100644 Binary files a/doc/STARmanual.pdf and b/doc/STARmanual.pdf differ diff --git a/extras/doc-latex/STARmanual.tex b/extras/doc-latex/STARmanual.tex index 35733839..ca3be10e 100644 --- a/extras/doc-latex/STARmanual.tex +++ b/extras/doc-latex/STARmanual.tex @@ -34,7 +34,7 @@ \newcommand{\sechyperref}[1]{\hyperref[#1]{Section \ref{#1}. \nameref{#1}}} -\title{STAR manual 2.4.1a} +\title{STAR manual 2.4.2a} \author{Alexnder Dobin\\ dobin@cshl.edu} \maketitle @@ -153,7 +153,7 @@ \subsubsection{Very small genome.} For small genomes, the parameter \opt{genomeSAindexNbases} needs to be scaled down, with a typical value of \code{min(14, log2(GenomeLength)/2 - 1)}. For example, for 1~megaBase genome, this is equal to 9, for 100~kiloBase genome, this is equal to 7. \subsubsection{Genome with a large number of references.} -If you are using a genome with a large (\textgreater 5,000) number of references (chrosomes/scaffolds), you may need to reduce the \opt{genomeChrBinNbits} to reduce RAM consumption. The following scaling is recomended: \opt{genomeChrBinNbits} = \code{min(18, log2(GenomeLength/NumberOfReferences))}. For example, for 3~gigaBase genome with 100,000 chromosomes/scaffolds, this is equal to 15. 
+If you are using a genome with a large (\textgreater 5,000) number of references (chromosomes/scaffolds), you may need to reduce the \opt{genomeChrBinNbits} to reduce RAM consumption. The following scaling is recommended: \opt{genomeChrBinNbits} = \code{min(18, log2(GenomeLength/NumberOfReferences))}. For example, for 3~gigaBase genome with 100,000 chromosomes/scaffolds, this is equal to 15. \section{Running mapping jobs.}\label{Running_mapping_jobs} \subsection{Basic options.} @@ -165,7 +165,7 @@ \subsection{Basic options.} \begin{itemize} \item[] -\opt{runThreadN} option defines the number of threads to be used for genome generation, it has to be set to the number of available cores on the server node. +%\opt{runThreadN} option defines the number of threads to be used for mapping, it has to be set to the number of available cores on the server node. \opt{genomeDir} specifies path to the genome directory where genome indices where generated (see \sechyperref{Generating_genome_indexes}). @@ -203,7 +203,7 @@ \subsubsection{ENCODE options} \opt{outFilterMismatchNmax} 999\\ maximum number of mismatches per pair, large number switches off this filter \item[] -\opt{outFilterMismatchNoverLmax} 0.04\\ +%\opt{outFilterMismatchNoverReadLmax} 0.04\\ max number of mismatches per pair relative to read length: for 2x100b, max number of mismatches is 0.06*200=8 for the paired read \item[] \opt{alignIntronMin} 20\\ @@ -367,11 +367,27 @@ \subsection{Chimeric alignments in \ofilen{Chimeric.out.junction}} \section{Output in transcript coordinates.} -With \opt{quantMode} \optv{TranscriptomeSAM} option STAR will outputs alignments translated into transcript coordinates in the \ofilen{Aligned.toTranscriptome.out.bam} file (in addition to alignments in genomic coordinates in \ofilen{Aligned.*.sam/bam} files). These transcriptomic alignments can be used with various transcript quantification software that require reads to be mapped to transcriptome, such as RSEM or eXpress.
For example, RSEM command line would look as follows: \codelines{rsem-calculate-expression ... --bam Aligned.toTranscriptome.out.bam /path/to/RSEM/reference RSEM}. +With \opt{quantMode} \optv{TranscriptomeSAM} option STAR will output alignments translated into transcript coordinates in the \ofilen{Aligned.toTranscriptome.out.bam} file (in addition to alignments in genomic coordinates in \ofilen{Aligned.*.sam/bam} files). These transcriptomic alignments can be used with various transcript quantification software that require reads to be mapped to transcriptome, such as RSEM or eXpress. For example, RSEM command line would look as follows: \codelines{rsem-calculate-expression ... --bam Aligned.toTranscriptome.out.bam /path/to/RSEM/reference RSEM}. Note, that STAR first aligns reads to entire genome, and only then searches for concordance between alignments and transcripts. I believe this approach might offer certain advantages compared to the alignment to transcriptome only, by not forcing the alignments to annotated transcripts. By default, the output satisfies RSEM requirements: soft-clipping or indels are not allowed. Use \opt{quantTranscriptomeBan} \optv{Singleend} to allow insertions, deletions ans soft-clips in the transcriptomic alignments, which can be used by some expression quantification software (e.g. eXpress). +\section{Counting number of reads per gene.} +With \opt{quantMode} \optv{GeneCounts} option STAR will count the number of reads per gene while mapping. +A read is counted if it overlaps (1nt or more) one and only one gene. Both ends of the paired-end read are checked for overlaps. +The counts coincide with those produced by htseq-count with default parameters. +This option requires annotations (GTF or GFF with \opt{sjdbGTFfile} option) used at the genome generation step, or at the mapping step.
+STAR outputs read counts per gene into ReadsPerGene.out.tab file with 4 columns which correspond to different strandedness options: +\begin{itemize}[leftmargin=1in] +\item[column 1:] gene ID +\item[column 2:] counts for unstranded RNA-seq +\item[column 3:] counts for the 1st read strand aligned with RNA (htseq-count option -s yes) +\item[column 4:] counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse) +\end{itemize} +Select the output according to the strandedness of your data. +Note, that if you have stranded data and choose one of the columns 3 or 4, the other column (4 or 3) will give you the count of antisense reads. +With \opt{quantMode} \optv{TranscriptomeSAM} \optv{GeneCounts}, you get both the \ofilen{Aligned.toTranscriptome.out.bam} and \ofilen{ReadsPerGene.out.tab} outputs. + \section{2-pass mapping.} diff --git a/extras/doc-latex/convertParDefToLatexTable.awk b/extras/doc-latex/convertParDefToLatexTable.awk index fa609821..529c4f2a 100644 --- a/extras/doc-latex/convertParDefToLatexTable.awk +++ b/extras/doc-latex/convertParDefToLatexTable.awk @@ -3,6 +3,7 @@ function substLatexSymbols() { gsub(">","{\\textgreater}"); gsub("<","{\\textless}"); gsub("_","{\\textunderscore}"); + gsub("&","{\\\\&}"); }; BEGIN { diff --git a/extras/doc-latex/parametersDefault.tex b/extras/doc-latex/parametersDefault.tex index 3c765d3a..aa5a846f 100644 --- a/extras/doc-latex/parametersDefault.tex +++ b/extras/doc-latex/parametersDefault.tex @@ -26,6 +26,13 @@ \optName{runThreadN} \optValue{1} \optLine{int: number of threads to run STAR} +\optName{runDirPerm} + \optValue{User{\textunderscore}RWX} + \optLine{string: permissions for the directories created at the run-time.
} +\begin{optOptTable} + \optOpt{User{\textunderscore}RWX} \optOptLine{user-read/write/execute} + \optOpt{All{\textunderscore}RWX} \optOptLine{all-read/write/execute (same as chmod 777)} +\end{optOptTable} \end{optTable} \optSection{Genome Parameters}\label{Genome_Parameters} \begin{optTable} @@ -258,7 +265,7 @@ \optLine{int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.} \optName{outSAMflagAND} \optValue{65535} - \optLine{int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG \& outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.} + \optLine{int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG {\&} outSAMflagAND. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not unset otherwise.} \optName{outSAMattrRGline} \optValue{-} \optLine{string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
} diff --git a/source/Makefile b/source/Makefile index 1d4484d5..a52ecc04 100644 --- a/source/Makefile +++ b/source/Makefile @@ -112,6 +112,10 @@ STARforMacStatic : CCFLAGS=-D'COMPILE_FOR_MAC' -I ./Mac_Include/ $(CCFLAGS_main) STARforMacStatic : parametersDefault.xxd $(OBJECTS) $(CXX) -o STAR $(CCFLAGS) $(LDFLAGS_Mac_static) $(OBJECTS) +STARlongForMacStatic : CCFLAGS=-D'COMPILE_FOR_LONG_READS' -D'COMPILE_FOR_MAC' -I ./Mac_Include/ $(CCFLAGS_main) +STARlongForMacStatic : parametersDefault.xxd $(OBJECTS) + $(CXX) -o STARlong $(CCFLAGS) $(LDFLAGS_Mac_static) $(OBJECTS) + # STARforMacGDB : CCFLAGS=-D'COMPILE_FOR_MAC' -I ./Mac_Include/ $(CCFLAGS_gdb) STARforMacGDB : parametersDefault.xxd $(OBJECTS) diff --git a/source/Quantifications.cpp b/source/Quantifications.cpp index 326a729d..c30585a6 100644 --- a/source/Quantifications.cpp +++ b/source/Quantifications.cpp @@ -2,6 +2,10 @@ Quantifications::Quantifications (uint32 nGeIn) { + geneCounts.nType=3; + geneCounts.cAmbig = new uintQ[geneCounts.nType]; + geneCounts.cNone = new uintQ[geneCounts.nType]; + geneCounts.nGe=nGeIn; geneCounts.gCount = new uintQ* [geneCounts.nType]; diff --git a/source/Quantifications.h b/source/Quantifications.h index e76a2266..85d080e7 100644 --- a/source/Quantifications.h +++ b/source/Quantifications.h @@ -8,9 +8,9 @@ class Quantifications { public: struct {//counting reads per gene, similar to HTseq uint32 nGe; //number of genes - static const int nType=3; //number of count types (columns) + int nType; //number of count types (columns) uintQ cMulti; //count multimappers - uintQ cAmbig[nType], cNone[nType];//ambigouous, no-feature + uintQ *cAmbig, *cNone;//ambiguous, no-feature uintQ **gCount; // array of read counts per gene for two strands } geneCounts; diff --git a/source/VERSION b/source/VERSION index 8b13d5dc..c6cf43ba 100644 --- a/source/VERSION +++ b/source/VERSION @@ -1 +1 @@ -#define STAR_VERSION "STAR_2.4.1d_modified" +#define STAR_VERSION "STAR_2.4.2a"