Skip to content

Commit

Permalink
Merge pull request #297 from liulab-dfci/special_gap
Browse files Browse the repository at this point in the history
Add the option "--imgtAdditionalGap" to specify additional gap positions
  • Loading branch information
mourisl authored Jul 31, 2024
2 parents 045f11a + 1bb0363 commit 78314de
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 18 deletions.
30 changes: 19 additions & 11 deletions Annotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,13 @@ char usage[] = "./annotator [OPTIONS]:\n"
"\t--barcode: there is barcode information in -a and -r files (default: not set)\n"
"\t--UMI: there is UMI information in -r file (default: not set)\n"
"\t--geneAlignment: output the gene alignment (default: not set)\n"
"\t--airrAlignment: output the aligned sequences to prefix_airr_align.tsv (default: not set)\n"
"\t--airrAlignment: output the aligned sequences to prefix_airr_align.tsv (default: not set)\n"
"\t--noImpute: do not impute CDR3 sequence for TCR (default: not set (impute))\n"
"\t--notIMGT: the receptor genome sequence is not in IMGT format (default: not set(in IMGT format))\n"
"\t--imgtAddtionalGap STRING: description for additional gap codon position in IMGT (0-based), e.g. \"TRAV:7,83\" for mouse (default: no)\n"
"\t--outputCDR3File: output CDR3 file when not using -r option (default: no output)\n"
"\t--needReverseComplement: reverse complement sequences on another strand (default: no)\n"
"\t--outputFormat INT: 0-fasta, 1-AIRR. (default: 0 (fasta))\n"
"\t--outputFormat INT: 0-fasta, 1-AIRR. (default: 0 (fasta))\n"
"\t--readAssignment STRING: output the read assignment to the file (default: no output)\n";

int nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
Expand Down Expand Up @@ -61,6 +62,7 @@ static struct option long_options[] = {
{ "fastq", no_argument, 0, 10011 },
{ "airrAlignment", no_argument, 0, 10012 },
{ "outputFormat", required_argument, 0, 10013 },
{ "imgtAdditionalGap", required_argument, 0, 10014},
{ (char *)0, 0, 0, 0}
} ;

Expand Down Expand Up @@ -448,6 +450,7 @@ int main( int argc, char *argv[] )
bool needRC = false ; // need reverse complment
int format = 0 ; // 0-trust4 format. 1-fasta, 2-fastq
int outputFormat = 0 ; //0-fasta, 1-airr
std::string imgtAdditionalGap ;
std::map<std::string, int> barcodeStrToInt ;
std::string assemblyFileName ;

Expand Down Expand Up @@ -532,18 +535,23 @@ int main( int argc, char *argv[] )
{
outputAirrAlignment = true ;
}
else if (c == 10013) // outputFormat
{
outputFormat = atoi(optarg) ;
}
else if (c == 10013) // outputFormat
{
outputFormat = atoi(optarg) ;
}
else if (c == 10014) // --imgtAdditionalGap
{
std::string s(optarg) ;
imgtAdditionalGap = s ;
}
else
{
fprintf( stderr, "%s", usage ) ;
return EXIT_FAILURE ;
}
}

refSet.InputRefFa( buffer, isIMGT ) ;
refSet.InputRefFa( buffer, isIMGT, imgtAdditionalGap.length() > 0 ? imgtAdditionalGap.c_str() : NULL ) ;
//refSet.OutputRef( stdout ) ;
//return 0 ;

Expand All @@ -568,10 +576,10 @@ int main( int argc, char *argv[] )
flrAssembly.Open(assemblyFileName.c_str()) ;
while ( flrAssembly.ReadLine() != NULL )
{
const char *lineBuffer = flrAssembly.GetLinePtr() ;
for (i = 0 ; lineBuffer[i] && lineBuffer[i] != '\n' && lineBuffer[i] != ' ' ; ++i)
buffer[i] = lineBuffer[i] ;
buffer[i] = '\0' ;
const char *lineBuffer = flrAssembly.GetLinePtr() ;
for (i = 0 ; lineBuffer[i] && lineBuffer[i] != '\n' && lineBuffer[i] != ' ' ; ++i)
buffer[i] = lineBuffer[i] ;
buffer[i] = '\0' ;

flrAssembly.ReadLine() ;
char *seq = strdup( flrAssembly.GetLinePtr() ) ;
Expand Down
83 changes: 77 additions & 6 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2406,7 +2406,7 @@ class SeqSet
}

// Input some baseline sequence to match against.
void InputRefFa( char *filename, bool isIMGT = false )
void InputRefFa( char *filename, bool isIMGT = false, const char *imgtAdditionalGap = NULL)
{
int i, j, k ;
ReadFiles fa ;
Expand Down Expand Up @@ -2562,18 +2562,65 @@ class SeqSet
if (sum == 0 || chainVMotifShiftCount[i][0] > sum / 2)
continue ;

SimpleVector<int> additionalGaps ;
if (imgtAdditionalGap != NULL)
{
int l ;
for (j = 0 ; imgtAdditionalGap[j] ;)
{
for (l = 0 ; l < 4 && imgtAdditionalGap[j + l] ; ++l)
if (imgtAdditionalGap[j + l] != chainName[i][l])
break ;
if (l >= 4)
break ;

for (; imgtAdditionalGap[j] && imgtAdditionalGap[j] != ';'; ++j)
break ;

if (imgtAdditionalGap[j]) // point to ';'
++j ;
}

// e.g. TRAV:7,83
j += 5 ;
int n = 0 ;
for (; imgtAdditionalGap[j] ; ++j)
{
if (imgtAdditionalGap[j] >= '0' && imgtAdditionalGap[j] <= '9')
n = n * 10 + imgtAdditionalGap[j] - '0' ;
else
{
additionalGaps.PushBack(n) ;
n = 0 ;
if (imgtAdditionalGap[j] == ';')
break ;
}
}
if (n != 0)
additionalGaps.PushBack(n) ;
}

int shift = 0 ;
for (j = 1 ; j < 5 ; ++j)
if (chainVMotifShiftCount[i][j] > sum / 2)
break ;
shift = j ;
bool additionalGapValid = false ;
if (shift < 5)
{
PrintLog("WARNING: IMGT may introduce %d bp speical gaps in %s. Will not annotate the CDR1 and CDR2 information for this chain.", 3 * shift, chainName[i]) ;
if (additionalGaps.Size() == shift)
{
additionalGapValid = true ;
PrintLog("WARNING: IMGT may introduce %d bp additional gaps in %s. Will use --imgtAdditiona adjustment to get CDR1 and CDR2 information for this chain.", 3 * shift, chainName[i]) ;
}
else
{
PrintLog("WARNING: IMGT may introduce %d bp additional gaps in %s, and the size does not match --imgtAdditionalGap (if provided). Will not annotate the CDR1 and CDR2 information for this chain.", 3 * shift, chainName[i]) ;
}
}
else
{
PrintLog("WARNING: IMGT may introduce speical gaps in %s and the gaps' total length cannot be determined. Will use the motif information for CDR3 inference and will not annotate the CDR1 and CDR2 information for this chain.", chainName[i]) ;
PrintLog("WARNING: IMGT may introduce additional gaps in %s and the gaps' total length cannot be determined. Will use the motif information for CDR3 inference and will not annotate the CDR1 and CDR2 information for this chain.", chainName[i]) ;
}

int seqCnt = seqs.size() ;
Expand All @@ -2583,9 +2630,33 @@ class SeqSet
if (GetChainType(sw.name) != i || GetGeneType(sw.name) != 0)
continue ;
//fprintf(stderr, "adjust %s\n", sw.name) ;

sw.info[0].a = sw.info[0].b = -1 ;
sw.info[1].a = sw.info[1].b = -1 ;

if (additionalGapValid)
{
int k, l ;
int size = additionalGaps.Size() ;
for (k = 0 ; k <= 1 ; ++k)
{
int aAdjust = 0 ;
int bAdjust = 0 ;
for (l = 0 ; l < size ; ++l)
{
if (sw.info[0].a > additionalGaps[l] * 3)
aAdjust += 3 ;
if (sw.info[0].b > additionalGaps[l] * 3)
bAdjust += 3 ;
}

sw.info[k].a += aAdjust ;
sw.info[k].b += bAdjust ;
}
}
else
{
sw.info[0].a = sw.info[0].b = -1 ;
sw.info[1].a = sw.info[1].b = -1 ;
}

if (shift < 5)
{
sw.info[2].a += 3 * shift ;
Expand Down
1 change: 1 addition & 0 deletions mouse/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
IMGT has additional gaps on the TRAV genes, so the conventional coordinate system does not apply. If you need the CDR1 and CDR2 information on the TRAV genes, please add the option "--imgtAdditionalGap TRAV:7,83".
8 changes: 7 additions & 1 deletion run-trust4
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use Cwd qw(cwd abs_path) ;
use File::Basename ;
use File::Path qw(make_path) ;

my $version = "v1.1.3-r523" ;
my $version = "v1.1.3-r526" ;

die "TRUST4 $version usage: ./run-trust4 [OPTIONS]:\n".
"Required:\n".
Expand Down Expand Up @@ -39,6 +39,7 @@ die "TRUST4 $version usage: ./run-trust4 [OPTIONS]:\n".
"\t--skipMateExtension: do not extend assemblies with mate information, useful for SMART-seq (default: not used)\n".
"\t--abnormalUnmapFlag: the flag in BAM for the unmapped read-pair is nonconcordant (default: not set)\n".
"\t--noExtraction: directly use the files from provided -1 -2/-u to assemble (default: extraction first)\n".
"\t--imgtAdditionalGap STRING: description for additional gap codon position in IMGT (0-based), e.g. \"TRAV:7,83\" for mouse (default: no)\n".
"\t--assembleWithRef: conduct the assembly with --ref file (default: use -f file)\n".
"\t--outputReadAssignment: output read assignment results to the prefix_assign.out file (default: no output)\n".
"\t--stage INT: start TRUST4 on specified stage (default: 0):\n".
Expand Down Expand Up @@ -288,6 +289,11 @@ for ( $i = 0 ; $i < @ARGV ; ++$i )
{
$assembleWithRef = 1 ;
}
elsif ($ARGV[$i] eq "--imgtAdditionalGap")
{
$annotatorArgs .= " --imgtAdditionalGap ".$ARGV[$i + 1] ;
++$i ;
}
else
{
die "Unknown parameter ".$ARGV[$i]."\n" ;
Expand Down

0 comments on commit 78314de

Please sign in to comment.