Merge pull request #263 from liulab-dfci/barcode_kmercount

Prioritize the barcode-wise k-mer count in read sorting
liulab-dfci · Apr 23, 2024 · 0419fba
2 parents f2164d4 + 4d3843e
commit 0419fba
Show file tree

Hide file tree

Showing 4 changed files with 263 additions and 76 deletions.
diff --git a/KmerCount.hpp b/KmerCount.hpp
@@ -8,30 +8,40 @@
 
 #include "KmerCode.hpp"
 
-#define KCOUNT_HASH_MAX 1000003
-
 class KmerCount
 {
 private:
 	std::map<uint64_t, int> *count ;
 	int kmerLength ;
 	KmerCode kmerCode ;
 	int maxReadLen ;
+	int khashMax ;
 
 	int *c ;
 
 	int GetHash( uint64_t k )
 	{
-		return k % KCOUNT_HASH_MAX ;
+		return k % khashMax ;
 	}
 public:
-	KmerCount( int k ): kmerCode( k ) 
+	KmerCount( int k, int hmax = 1000003 ): kmerCode( k )
 	{ 
-		kmerLength = k ; 
+		kmerLength = k ;
+		khashMax = hmax ;
+		maxReadLen = -1 ;
+		c = NULL ;
+		count = new std::map<uint64_t, int>[khashMax] ;
+	}
+
+	KmerCount(const KmerCount &b): kmerCode(b.kmerLength)
+	{
+		kmerLength = b.kmerLength ;
+		khashMax = b.khashMax ;
 		maxReadLen = -1 ;
 		c = NULL ;
-		count = new std::map<uint64_t, int>[KCOUNT_HASH_MAX] ;
+		count = new std::map<uint64_t, int>[khashMax] ;
 	}
+
 	~KmerCount() 
 	{
 		if ( c != NULL )
@@ -97,7 +107,7 @@ class KmerCount
 		int i, j ;
 		FILE *fp = fopen( file, "r" ) ;
 		char *buffer = new char[kmerLength + 1] ;
-		for ( i = 0 ; i < KCOUNT_HASH_MAX ; ++i )	
+		for ( i = 0 ; i < khashMax ; ++i )	
 		{
 			for ( std::map<uint64_t, int>::iterator it = count[i].begin() ; it != count[i].end() ; ++it )
 			{
@@ -121,6 +131,12 @@ class KmerCount
 		if ( c == NULL )
 			 c = new int[ sz ] ;
 	}
+
+	void SetBuffer()
+	{
+		if (c == NULL && maxReadLen > 0)
+			c = new int[maxReadLen] ;
+	}
 
 	int GetCount( char *kmer )
 	{
@@ -142,7 +158,7 @@ class KmerCount
 	}
 
 
-	int GetCountStatsAndTrim( char *read, char *qual, int &minCount, int &medianCount, double &avgCount )
+	int GetCountStatsAndTrim( char *read, char *qual, int &minCount, int &medianCount, float &avgCount )
 	{
 		int i, k ;
 		int sum ;

diff --git a/SeqSet.hpp b/SeqSet.hpp
@@ -3580,54 +3580,61 @@ class SeqSet
 							seq.posWeight[i + shift] = seq.posWeight[i] ;
 
 						// Lower the weight at the end for original sequence.
-						for ( i = 0 ; i < 2 ; ++i )
+						if (barcode == -1 || minKmerCount > 1)
 						{
-							if ( i + shift >= len || r[i + shift] == 'N' )
-								continue ;
-							// If the current weight is 1, change the consensus to the newly input nucleotide
-							if (r[i + shift] != newConsensus[i + shift]
-									&& newConsensus[i + shift] != 'N' // The equal to N case will be handled later 
-									&& seq.posWeight[i + shift].count[ nucToNum[newConsensus[i + shift] - 'A']] == 1)
+							for ( i = 0 ; i < 2 ; ++i )
 							{
-								struct _pair np ;
-								np.a = i + shift ;
-								np.b = (int)(r[i + shift]) ;
-								consensusReplacement.PushBack(np) ;	
-							}
+								if ( i + shift >= len || r[i + shift] == 'N' )
+									continue ;
+								// If the current weight is 1, change the consensus to the newly input nucleotide
+								if (r[i + shift] != newConsensus[i + shift]
+										&& newConsensus[i + shift] != 'N' // The equal to N case will be handled later 
+										&& seq.posWeight[i + shift].count[ nucToNum[newConsensus[i + shift] - 'A']] == 1)
+								{
+									struct _pair np ;
+									np.a = i + shift ;
+									np.b = (int)(r[i + shift]) ;
+									consensusReplacement.PushBack(np) ;	
+								}
 
-							for ( j = 0 ; j < 4 ; ++j )
-								if ( r[i + shift] != numToNuc[j] && seq.posWeight[i + shift].count[j] > 1 )
-									--seq.posWeight[i + shift].count[j] ;
+								for ( j = 0 ; j < 4 ; ++j )
+									if ( r[i + shift] != numToNuc[j] && seq.posWeight[i + shift].count[j] > 1 )
+										--seq.posWeight[i + shift].count[j] ;
+							}
 						}
 						seq.posWeight.SetZero( 0, shift ) ;
 					}
+
 					if ( extendedOverlaps[0].readEnd < len - 1 )
 					{
 						int start = extendedOverlaps[0].readStart + seq.consensusLen ;
 						seq.posWeight.SetZero( start, len - extendedOverlaps[0].readEnd - 1 ) ;
 
 						// Lower the weight at the end for original sequence.
-						for ( i = seq.consensusLen - 2 ; i < seq.consensusLen ; ++i )
+						if (barcode == -1 || minKmerCount > 1)
 						{
-							int pos = i - extendedOverlaps[0].seqStart ;
-							int seqPos = i + shift ;
-							if ( pos < 0 || r[pos] == 'N' )
-								continue ;
-
-							// If the current weight is 1, change the consensus to the newly input nucleotide
-							if (r[pos] != newConsensus[seqPos]
-									&& newConsensus[seqPos] != 'N' 
-									&& seq.posWeight[seqPos].count[ nucToNum[newConsensus[seqPos] - 'A']] == 1)
+							for ( i = seq.consensusLen - 2 ; i < seq.consensusLen ; ++i )
 							{
-								struct _pair np ;
-								np.a = seqPos ;
-								np.b = (int)r[pos] ;
-								consensusReplacement.PushBack(np) ;
-							}
+								int pos = i - extendedOverlaps[0].seqStart ;
+								int seqPos = i + shift ;
+								if ( pos < 0 || r[pos] == 'N' )
+									continue ;
 
-							for ( j = 0 ; j < 4 ; ++j )
-								if ( r[pos] != numToNuc[j] && seq.posWeight[seqPos].count[j] > 1 )
-									--seq.posWeight[seqPos].count[j] ;
+								// If the current weight is 1, change the consensus to the newly input nucleotide
+								if (r[pos] != newConsensus[seqPos]
+										&& newConsensus[seqPos] != 'N' 
+										&& seq.posWeight[seqPos].count[ nucToNum[newConsensus[seqPos] - 'A']] == 1)
+								{
+									struct _pair np ;
+									np.a = seqPos ;
+									np.b = (int)r[pos] ;
+									consensusReplacement.PushBack(np) ;
+								}
+
+								for ( j = 0 ; j < 4 ; ++j )
+									if ( r[pos] != numToNuc[j] && seq.posWeight[seqPos].count[j] > 1 )
+										--seq.posWeight[seqPos].count[j] ;
+							}
 						}
 					}