Skip to content

Commit

Permalink
Merge pull request #263 from liulab-dfci/barcode_kmercount
Browse files Browse the repository at this point in the history
Prioritize the barcode-wise k-mer count in read sorting
  • Loading branch information
mourisl authored Apr 23, 2024
2 parents f2164d4 + 4d3843e commit 0419fba
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 76 deletions.
32 changes: 24 additions & 8 deletions KmerCount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,40 @@

#include "KmerCode.hpp"

#define KCOUNT_HASH_MAX 1000003

class KmerCount
{
private:
std::map<uint64_t, int> *count ;
int kmerLength ;
KmerCode kmerCode ;
int maxReadLen ;
int khashMax ;

int *c ;

// Reduce a k-mer code to a value in [0, khashMax); presumably used as the
// index of the std::map inside `count` that holds this k-mer.
// The modulus is the per-instance khashMax because `count` is allocated with
// khashMax entries. Reducing by the fixed KCOUNT_HASH_MAX instead could yield
// an index past the end of a table created with a smaller hmax.
int GetHash( uint64_t k )
{
	return k % khashMax ;
}
public:
// Create an empty counter for k-mers of length k.
// hmax is the number of std::map entries allocated for `count`; it defaults
// to 1000003 so callers that pass only k keep working.
// NOTE(review): hmax is used directly as an array size and as a modulus, so
// it is assumed to be positive - confirm against callers.
KmerCount( int k, int hmax = 1000003 ): kmerCode( k )
{
	kmerLength = k ;
	khashMax = hmax ;
	maxReadLen = -1 ; // SetBuffer() only allocates once this is > 0
	c = NULL ; // allocated later by one of the SetBuffer overloads
	count = new std::map<uint64_t, int>[khashMax] ;
}

// Copy constructor: builds a counter with the same k-mer length and the same
// number of table entries as b. The k-mer counts held by b are NOT copied;
// `count` is a fresh array of empty maps, maxReadLen is reset to -1 and the
// buffer c is left unallocated.
KmerCount(const KmerCount &b): kmerCode(b.kmerLength)
{
	kmerLength = b.kmerLength ;
	khashMax = b.khashMax ;
	maxReadLen = -1 ;
	c = NULL ;
	// Allocate the table exactly once, sized by khashMax. Allocating it a
	// second time would overwrite the pointer and leak the first array.
	count = new std::map<uint64_t, int>[khashMax] ;
}

~KmerCount()
{
if ( c != NULL )
Expand Down Expand Up @@ -97,7 +107,7 @@ class KmerCount
int i, j ;
FILE *fp = fopen( file, "r" ) ;
char *buffer = new char[kmerLength + 1] ;
for ( i = 0 ; i < KCOUNT_HASH_MAX ; ++i )
for ( i = 0 ; i < khashMax ; ++i )
{
for ( std::map<uint64_t, int>::iterator it = count[i].begin() ; it != count[i].end() ; ++it )
{
Expand All @@ -121,6 +131,12 @@ class KmerCount
if ( c == NULL )
c = new int[ sz ] ;
}

// Allocate the int buffer c with maxReadLen entries, if it has not been
// allocated yet. Does nothing when c already exists or when maxReadLen is
// not positive.
void SetBuffer()
{
	if ( c != NULL || maxReadLen <= 0 )
		return ;

	c = new int[maxReadLen] ;
}

int GetCount( char *kmer )
{
Expand All @@ -142,7 +158,7 @@ class KmerCount
}


int GetCountStatsAndTrim( char *read, char *qual, int &minCount, int &medianCount, double &avgCount )
int GetCountStatsAndTrim( char *read, char *qual, int &minCount, int &medianCount, float &avgCount )
{
int i, k ;
int sum ;
Expand Down
73 changes: 40 additions & 33 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3580,54 +3580,61 @@ class SeqSet
seq.posWeight[i + shift] = seq.posWeight[i] ;

// Lower the weight at the end for original sequence.
for ( i = 0 ; i < 2 ; ++i )
if (barcode == -1 || minKmerCount > 1)
{
if ( i + shift >= len || r[i + shift] == 'N' )
continue ;
// If the current weight is 1, change the consensus to the newly input nucleotide
if (r[i + shift] != newConsensus[i + shift]
&& newConsensus[i + shift] != 'N' // The equal to N case will be handled later
&& seq.posWeight[i + shift].count[ nucToNum[newConsensus[i + shift] - 'A']] == 1)
for ( i = 0 ; i < 2 ; ++i )
{
struct _pair np ;
np.a = i + shift ;
np.b = (int)(r[i + shift]) ;
consensusReplacement.PushBack(np) ;
}
if ( i + shift >= len || r[i + shift] == 'N' )
continue ;
// If the current weight is 1, change the consensus to the newly input nucleotide
if (r[i + shift] != newConsensus[i + shift]
&& newConsensus[i + shift] != 'N' // The equal to N case will be handled later
&& seq.posWeight[i + shift].count[ nucToNum[newConsensus[i + shift] - 'A']] == 1)
{
struct _pair np ;
np.a = i + shift ;
np.b = (int)(r[i + shift]) ;
consensusReplacement.PushBack(np) ;
}

for ( j = 0 ; j < 4 ; ++j )
if ( r[i + shift] != numToNuc[j] && seq.posWeight[i + shift].count[j] > 1 )
--seq.posWeight[i + shift].count[j] ;
for ( j = 0 ; j < 4 ; ++j )
if ( r[i + shift] != numToNuc[j] && seq.posWeight[i + shift].count[j] > 1 )
--seq.posWeight[i + shift].count[j] ;
}
}
seq.posWeight.SetZero( 0, shift ) ;
}

if ( extendedOverlaps[0].readEnd < len - 1 )
{
int start = extendedOverlaps[0].readStart + seq.consensusLen ;
seq.posWeight.SetZero( start, len - extendedOverlaps[0].readEnd - 1 ) ;

// Lower the weight at the end for original sequence.
for ( i = seq.consensusLen - 2 ; i < seq.consensusLen ; ++i )
if (barcode == -1 || minKmerCount > 1)
{
int pos = i - extendedOverlaps[0].seqStart ;
int seqPos = i + shift ;
if ( pos < 0 || r[pos] == 'N' )
continue ;

// If the current weight is 1, change the consensus to the newly input nucleotide
if (r[pos] != newConsensus[seqPos]
&& newConsensus[seqPos] != 'N'
&& seq.posWeight[seqPos].count[ nucToNum[newConsensus[seqPos] - 'A']] == 1)
for ( i = seq.consensusLen - 2 ; i < seq.consensusLen ; ++i )
{
struct _pair np ;
np.a = seqPos ;
np.b = (int)r[pos] ;
consensusReplacement.PushBack(np) ;
}
int pos = i - extendedOverlaps[0].seqStart ;
int seqPos = i + shift ;
if ( pos < 0 || r[pos] == 'N' )
continue ;

for ( j = 0 ; j < 4 ; ++j )
if ( r[pos] != numToNuc[j] && seq.posWeight[seqPos].count[j] > 1 )
--seq.posWeight[seqPos].count[j] ;
// If the current weight is 1, change the consensus to the newly input nucleotide
if (r[pos] != newConsensus[seqPos]
&& newConsensus[seqPos] != 'N'
&& seq.posWeight[seqPos].count[ nucToNum[newConsensus[seqPos] - 'A']] == 1)
{
struct _pair np ;
np.a = seqPos ;
np.b = (int)r[pos] ;
consensusReplacement.PushBack(np) ;
}

for ( j = 0 ; j < 4 ; ++j )
if ( r[pos] != numToNuc[j] && seq.posWeight[seqPos].count[j] > 1 )
--seq.posWeight[seqPos].count[j] ;
}
}
}

Expand Down
Loading

0 comments on commit 0419fba

Please sign in to comment.