Skip to content

Commit

Permalink
add consensus calling option: 'most frequent bases at each pos.'
Browse files Browse the repository at this point in the history
  • Loading branch information
yangao07 committed Apr 21, 2024
1 parent 8d1b1cc commit 769821f
Show file tree
Hide file tree
Showing 9 changed files with 264 additions and 139 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/linux-CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,12 @@ jobs:
submodules: recursive

- name: make
run: make
run: make

# Smoke-test the freshly built binary on the bundled test data.
# A block scalar with one command per line is used on purpose: a plain
# multi-line YAML scalar folds line breaks into spaces and keeps the
# backslashes literal, so the shell would not see them as line continuations.
# Paths use ./test_data/ throughout (the last two previously read
# ./test_data./heter.fq and were missing the ';' separator between them).
- name: Run Test Data
  run: |
    ./bin/abpoa ./test_data/test.fa
    ./bin/abpoa ./test_data/heter.fa -d2
    ./bin/abpoa ./test_data/seq.fa -a1
    ./bin/abpoa ./test_data/seq.fa -r5 -a1
    ./bin/abpoa ./test_data/heter.fq -d2 -Q
    ./bin/abpoa ./test_data/heter.fq -d2 -Q -a1
8 changes: 4 additions & 4 deletions include/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#define ABPOA_OUT_CONS_FQ 5

#define ABPOA_HB 0
#define ABPOA_HC 1
#define ABPOA_MC 1

#define ABPOA_NONE_VERBOSE 0
#define ABPOA_INFO_VERBOSE 1
Expand Down Expand Up @@ -77,7 +77,7 @@ typedef struct {
uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1;
uint8_t use_qv:1, disable_seeding:1, progressive_poa:1;
char *incr_fn, *out_pog;
int align_mode, gap_mode, max_n_cons;
int align_mode, gap_mode, max_n_cons, cons_algrm; // consensus calling algorithm: 0: partial order graph, 1: majority voting
double min_freq; // for multiploid data
int verbose; // to control output msg

Expand All @@ -88,8 +88,8 @@ typedef struct {
typedef struct {
int node_id;
int in_edge_n, in_edge_m, *in_id;
int out_edge_n, out_edge_m, *out_id; int *out_weight;
int *read_weight, n_read, m_read; // weight of each read, valid when use_qv=1
int out_edge_n, out_edge_m, *out_id; int *out_edge_weight; // out_edge_weight: edge-wise weight
int *read_weight, n_read, m_read; // read_weight: read-wise weight, valid when use_qv=1
uint64_t **read_ids; int read_ids_n; // for each edge

int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank
Expand Down
24 changes: 15 additions & 9 deletions src/abpoa.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ const struct option abpoa_long_opt [] = {
{ "output", 1, NULL, 'o' },
{ "result", 1, NULL, 'r' },
{ "out-pog", 1, NULL, 'g' },
{ "cons-algrm", 1, NULL, 'a'},
{ "max-num-cons", 1, NULL, 'd', },
{ "min-freq", 1, NULL, 'q', },

Expand All @@ -65,14 +66,14 @@ int abpoa_usage(void)
err_printf("%s: %s \n\n", PROG, DESCRIPTION);
err_printf("Version: %s\t", VERSION);
err_printf("Contact: %s\n\n", CONTACT);
err_printf("Usage: %s [options] <in.fa/fq> > cons.fa/msa.out/abpoa.gfa\n\n", PROG);
err_printf("Usage: %s [options] <in.fa/fq> > cons.fa/msa.fa/abpoa.gfa\n\n", PROG);
err_printf("Options:\n");
err_printf(" Alignment:\n");
err_printf(" -m --aln-mode INT alignment mode [%d]\n", ABPOA_GLOBAL_MODE);
err_printf(" -m --aln-mode INT alignment mode [%d]\n", ABPOA_GLOBAL_MODE);
err_printf(" %d: global, %d: local, %d: extension\n", ABPOA_GLOBAL_MODE, ABPOA_LOCAL_MODE, ABPOA_EXTEND_MODE);
err_printf(" -M --match INT match score [%d]\n", ABPOA_MATCH);
err_printf(" -X --mismatch INT mismatch penalty [%d]\n", ABPOA_MISMATCH);
err_printf(" -t --matrix FILE scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [Null]\n");
err_printf(" -M --match INT match score [%d]\n", ABPOA_MATCH);
err_printf(" -X --mismatch INT mismatch penalty [%d]\n", ABPOA_MISMATCH);
err_printf(" -t --matrix FILE scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [Null]\n");
err_printf(" e.g., \'HOXD70.mtx, BLOSUM62.mtx\'\n");
err_printf(" -O --gap-open INT(,INT) gap opening penalty (O1,O2) [%d,%d]\n", ABPOA_GAP_OPEN1, ABPOA_GAP_OPEN2);
err_printf(" -E --gap-ext INT(,INT) gap extension penalty (E1,E2) [%d,%d]\n", ABPOA_GAP_EXT1, ABPOA_GAP_EXT2);
Expand All @@ -84,9 +85,9 @@ int abpoa_usage(void)
err_printf(" for each input sequence, try the reverse complement if the current\n");
err_printf(" alignment score is too low, and pick the strand with a higher score\n");
err_printf(" Adaptive banded DP:\n");
err_printf(" -b --extra-b INT first adaptive banding parameter [%d]\n", ABPOA_EXTRA_B);
err_printf(" -b --extra-b INT first adaptive banding parameter [%d]\n", ABPOA_EXTRA_B);
err_printf(" set b as < 0 to disable adaptive banded DP\n");
err_printf(" -f --extra-f FLOAT second adaptive banding parameter [%.2f]\n", ABPOA_EXTRA_F);
err_printf(" -f --extra-f FLOAT second adaptive banding parameter [%.2f]\n", ABPOA_EXTRA_F);
err_printf(" the number of extra bases added on both sites of the band is\n");
err_printf(" b+f*L, where L is the length of the aligned sequence\n");
// err_printf(" -z --zdrop INT Z-drop score in extension alignment [-1]\n");
Expand All @@ -102,7 +103,8 @@ int abpoa_usage(void)
// err_printf(" -n --par-size minimal partition size [%d]\n", ABPOA_W);

err_printf(" Input/Output:\n");
err_printf(" -Q --use-qual-weight take base quality score from FASTQ input file as graph edge weight [False]\n");
err_printf(" -Q --use-qual-weight take base quality score from FASTQ input file as graph edge weight for consensus calling [False]\n");
err_printf(" effective only when input sequences are in FASTQ format and consensus calling with heaviest bundling\n");
err_printf(" -c --amino-acid input sequences are amino acid (default is nucleotide) [False]\n");
err_printf(" -l --in-list input file is a list of sequence file names [False]\n");
err_printf(" each line is one sequence file containing a set of sequences\n");
Expand All @@ -117,6 +119,9 @@ int abpoa_usage(void)
err_printf(" - %d: graph in GFA format\n", ABPOA_OUT_GFA);
err_printf(" - %d: graph with consensus path in GFA format\n", ABPOA_OUT_CONS_GFA);
err_printf(" - %d: consensus in FASTQ format\n", ABPOA_OUT_CONS_FQ);
err_printf(" -a --cons-algrm INT consensus algorithm [%d]\n", ABPOA_HB);
err_printf(" - %d: heaviest bundling path in partial order graph\n", ABPOA_HB);
err_printf(" - %d: most frequent bases at each position\n", ABPOA_MC);
err_printf(" -d --maxnum-cons INT max. number of consensus sequence to generate [1]\n");
err_printf(" -q --min-freq FLOAT min. frequency of each consensus sequence (only effective when -d/--num-cons > 1) [%.2f]\n", MULTIP_MIN_FREQ);
err_printf(" -g --out-pog FILE dump final alignment graph to FILE (.pdf/.png) [Null]\n\n");
Expand Down Expand Up @@ -151,7 +156,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){

int main(int argc, char **argv) {
int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para();
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:a:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
switch(c)
{
case 'm': m = atoi(optarg);
Expand Down Expand Up @@ -194,6 +199,7 @@ int main(int argc, char **argv) {
break;
case 'g': abpt->out_pog= strdup(optarg); break;

case 'a': abpt->cons_algrm = atoi(optarg); break;
case 'd': abpt->max_n_cons = atoi(optarg); break;
case 'q': abpt->min_freq = atof(optarg); break;

Expand Down
8 changes: 4 additions & 4 deletions src/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#define ABPOA_OUT_CONS_FQ 5

#define ABPOA_HB 0
#define ABPOA_HC 1
#define ABPOA_MC 1

#define ABPOA_NONE_VERBOSE 0
#define ABPOA_INFO_VERBOSE 1
Expand Down Expand Up @@ -77,7 +77,7 @@ typedef struct {
uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1;
uint8_t use_qv:1, disable_seeding:1, progressive_poa:1;
char *incr_fn, *out_pog;
int align_mode, gap_mode, max_n_cons;
int align_mode, gap_mode, max_n_cons, cons_algrm; // consensus calling algorithm: 0: partial order graph, 1: majority voting
double min_freq; // for multiploid data
int verbose; // to control output msg

Expand All @@ -88,8 +88,8 @@ typedef struct {
typedef struct {
int node_id;
int in_edge_n, in_edge_m, *in_id;
int out_edge_n, out_edge_m, *out_id; int *out_weight;
int *read_weight, n_read, m_read; // weight of each read, valid when use_qv=1
int out_edge_n, out_edge_m, *out_id; int *out_edge_weight; // out_edge_weight: edge-wise weight
int *read_weight, n_read, m_read; // read_weight: read-wise weight, valid when use_qv=1
uint64_t **read_ids; int read_ids_n; // for each edge

int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank
Expand Down
7 changes: 4 additions & 3 deletions src/abpoa_align.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ abpoa_para_t *abpoa_init_para(void) {
abpt->out_fq = 0; // output consensus sequence in fastq
abpt->out_gfa = 0; // out graph in GFA format
abpt->out_msa = 0; // output msa
abpt->cons_algrm = ABPOA_HB; // consensus calling algorithm
abpt->max_n_cons = 1; // number of max. generated consensus sequence
abpt->min_freq = MULTIP_MIN_FREQ;
abpt->use_read_ids = 0;
Expand Down Expand Up @@ -142,10 +143,10 @@ abpoa_para_t *abpoa_init_para(void) {

void abpoa_post_set_para(abpoa_para_t *abpt) {
abpoa_set_gap_mode(abpt);
if (abpt->out_msa || abpt->out_gfa || abpt->max_n_cons > 1) {
if (abpt->out_msa || abpt->out_gfa || abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MC) {
abpt->use_read_ids = 1;
set_65536_table();
if (abpt->max_n_cons > 1) set_bit_table16();
if (abpt->out_msa || abpt->out_gfa) set_65536_table();
if (abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MC) set_bit_table16();
}
if (abpt->align_mode == ABPOA_LOCAL_MODE) abpt->wb = -1;
int i;
Expand Down
18 changes: 9 additions & 9 deletions src/abpoa_graph.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void abpoa_free_node(abpoa_node_t *node, int n) {
for (i = 0; i < n; ++i) {
if (node[i].in_edge_m > 0) free(node[i].in_id);
if (node[i].out_edge_m > 0) {
free(node[i].out_id); free(node[i].out_weight);
free(node[i].out_id); free(node[i].out_edge_weight);
if (node[i].read_ids_n > 0) {
for (j = 0; j < node[i].out_edge_m; ++j) {
free(node[i].read_ids[j]);
Expand All @@ -51,7 +51,7 @@ abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id, int
if (edge_m <= 0) {
abg->node[id].out_edge_m = MAX_OF_TWO(abg->node[id].out_edge_n, 1);
abg->node[id].out_id = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int));
abg->node[id].out_weight = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int));
abg->node[id].out_edge_weight = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int));
if (use_read_ids || abg->node[id].read_ids_n > 0) {
abg->node[id].read_ids = (uint64_t**)_err_malloc(abg->node[id].out_edge_m * sizeof(uint64_t*));
if (abg->node[id].read_ids_n > 0) {
Expand All @@ -64,7 +64,7 @@ abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id, int
} else if (abg->node[id].out_edge_n >= edge_m) {
abg->node[id].out_edge_m = abg->node[id].out_edge_n+1; kroundup32(abg->node[id].out_edge_m);
abg->node[id].out_id = (int*)_err_realloc(abg->node[id].out_id, abg->node[id].out_edge_m * sizeof(int));
abg->node[id].out_weight = (int*)_err_realloc(abg->node[id].out_weight, abg->node[id].out_edge_m * sizeof(int));
abg->node[id].out_edge_weight = (int*)_err_realloc(abg->node[id].out_edge_weight, abg->node[id].out_edge_m * sizeof(int));
if (use_read_ids || abg->node[id].read_ids_n > 0) {
abg->node[id].read_ids = (uint64_t**)_err_realloc(abg->node[id].read_ids, abg->node[id].out_edge_m * sizeof(uint64_t*));
if (abg->node[id].read_ids_n > 0) {
Expand Down Expand Up @@ -253,8 +253,8 @@ void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id) {
int max_w=-1, max_id=-1;
for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
out_id = abg->node[cur_id].out_id[i];
if (abg->node[cur_id].out_weight[i] > max_w) {
max_w = abg->node[cur_id].out_weight[i];
if (abg->node[cur_id].out_edge_weight[i] > max_w) {
max_w = abg->node[cur_id].out_edge_weight[i];
max_id = out_id;
}
}
Expand Down Expand Up @@ -287,7 +287,7 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt) {
// fprintf(stderr, "node_n: %d, index_rank_m: %d\n", node_n, abg->index_rank_m);
abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, abg->index_rank_m * sizeof(int));
abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, abg->index_rank_m * sizeof(int));
if (abpt->out_msa || abpt->max_n_cons > 1)
if (abpt->out_msa || abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MC)
abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, abg->index_rank_m * sizeof(int));
if (abpt->wb >= 0) {
abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, abg->index_rank_m * sizeof(int));
Expand Down Expand Up @@ -426,7 +426,7 @@ int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_e
int i;
for (i = 0; i < out_edge_n; ++i) {
if (abg->node[from_id].out_id[i] == to_id) { // edge exists
abg->node[from_id].out_weight[i] += w; // update weight on existing edge
abg->node[from_id].out_edge_weight[i] += w; // update weight on existing edge
// update label id
edge_exist = 1;
out_edge_i = i;
Expand All @@ -444,7 +444,7 @@ int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_e
/// out edge
abpoa_realloc_graph_edge(abg, 1, from_id, add_read_id);
abg->node[from_id].out_id[out_edge_n] = to_id;
abg->node[from_id].out_weight[out_edge_n] = w; // initial weight for new edge
abg->node[from_id].out_edge_weight[out_edge_n] = w; // initial weight for new edge
out_edge_i = out_edge_n;
++abg->node[from_id].out_edge_n;
}
Expand Down Expand Up @@ -699,7 +699,7 @@ void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen) {
abg->node_m = abg->index_rank_m = node_m;
abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, node_m * sizeof(int));
abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, node_m * sizeof(int));
if (abpt->out_msa || abpt->max_n_cons > 1)
if (abpt->out_msa || abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MC)
abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, node_m * sizeof(int));
if (abpt->wb >= 0) {
abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, node_m * sizeof(int));
Expand Down
Loading

0 comments on commit 769821f

Please sign in to comment.