From 6366029324e2d01902c4699a11a1b966dcfa3c8f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 9 Sep 2022 12:04:06 +0100 Subject: [PATCH] Make tabix support CSI indices with large positions. This already worked for SAM and VCF where the SQ and Contig lines indicate the maximum length of a reference sequence. However for BED files this was left as zero, which had the effect of fighting against the user by decreasing n_lvls as we increase min_shift. When unknown, max_ref_len is now an arbitrary large size (100G), but this may produce more levels than are strictly necessary, although this doesn't appear to have negative consequences. Also fixed the misleading error message about CSI being unable to index data. This was perhaps intended to be for mis-specified VCF data where a contig was listed as small but the records were at larger offsets, however it simply led me up the garden path by categorically stating CSI cannot store such large values. --- hts.c | 6 +++--- tabix.1 | 2 +- tbx.c | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hts.c b/hts.c index 8b437f2b9..c79d92d99 100644 --- a/hts.c +++ b/hts.c @@ -2354,9 +2354,9 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) return 0; if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos - " cannot be stored in a csi index. " - "Please check headers match the data", + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" " + "cannot be stored in a csi index with these parameters. " + "Please use a larger min_shift or depth", beg, end); } else { hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos diff --git a/tabix.1 b/tabix.1 index 2d403aaea..559ec6e69 100644 --- a/tabix.1 +++ b/tabix.1 @@ -101,7 +101,7 @@ start column. [5] Force to overwrite the index file if it is present. 
.TP .BI "-m, --min-shift " INT -set minimal interval size for CSI indices to 2^INT [14] +Set minimal interval size for CSI indices to 2^INT [14] .TP .BI "-p, --preset " STR Input format for indexing. Valid values are: gff, bed, sam, vcf. diff --git a/tbx.c b/tbx.c index 3af2c09fb..61d2ccd65 100644 --- a/tbx.c +++ b/tbx.c @@ -321,8 +321,11 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) continue; } if (first == 0) { - if (fmt == HTS_FMT_CSI) + if (fmt == HTS_FMT_CSI) { + if (!max_ref_len) + max_ref_len = (int64_t)100*1024*1024*1024; // 100G default n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); + } tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1;