Skip to content

Commit

Permalink
Fix compression of data blocks close to 2GB in size. (PR #55)
Browse files Browse the repository at this point in the history
There is still a limit of 2GB though, which is now tested for on entry.

Reported by Divon Lan
  • Loading branch information
jkbonfield authored Jul 14, 2022
1 parent da99342 commit 0523328
Show file tree
Hide file tree
Showing 16 changed files with 85 additions and 30 deletions.
5 changes: 5 additions & 0 deletions htscodecs/arith_dynamic.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,11 @@ unsigned char *arith_compress_to(unsigned char *in, unsigned int in_size,
unsigned int c_meta_len;
uint8_t *rle = NULL, *packed = NULL;

if (in_size > INT_MAX) {
*out_size = 0;
return NULL;
}

if (!out) {
*out_size = arith_compress_bound(in_size, order);
if (!(out = malloc(*out_size)))
Expand Down
5 changes: 5 additions & 0 deletions htscodecs/fqzcomp_qual.c
Original file line number Diff line number Diff line change
Expand Up @@ -1478,6 +1478,11 @@ unsigned char *uncompress_block_fqz2f(fqz_slice *s,

/*
 * Public wrapper around compress_block_fqz2f().
 *
 * Inputs larger than INT_MAX bytes are refused up front: *comp_size is
 * set to 0 and NULL is returned without calling the block compressor.
 * For any other size the arguments are forwarded unchanged (note that
 * "strat" moves to the second position in the callee's argument list)
 * and the callee's result is returned as a char pointer.
 */
char *fqz_compress(int vers, fqz_slice *s, char *in, size_t uncomp_size,
                   size_t *comp_size, int strat, fqz_gparams *gp) {
    if (uncomp_size <= INT_MAX) {
        // Size is within the supported range; hand off to the real encoder.
        return (char *)compress_block_fqz2f(vers, strat, s,
                                            (unsigned char *)in,
                                            uncomp_size, comp_size, gp);
    }

    // Too large to compress: report an empty result to the caller.
    *comp_size = 0;
    return NULL;
}
Expand Down
9 changes: 7 additions & 2 deletions htscodecs/rANS_static.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
if (!out_buf)
return NULL;

ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
ptr = out_end = out_buf + (uint32_t)(1.05*in_size) + 257*257*3 + 9;

// Compute statistics
hist8(in, in_size, (uint32_t *)F);
Expand Down Expand Up @@ -401,7 +401,7 @@ unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
out_buf = malloc(1.05*in_size + 257*257*3 + 9);
if (!out_buf) goto cleanup;

out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
out_end = out_buf + (uint32_t)(1.05*in_size) + 257*257*3 + 9;
cp = out_buf+9;

hist1_4(in, in_size, (uint32_t (*)[256])F, (uint32_t *)T);
Expand Down Expand Up @@ -815,6 +815,11 @@ unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
*/
unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
unsigned int *out_size, int order) {
if (in_size > INT_MAX) {
*out_size = 0;
return NULL;
}

return order
? rans_compress_O1(in, in_size, out_size)
: rans_compress_O0(in, in_size, out_size);
Expand Down
7 changes: 4 additions & 3 deletions htscodecs/rANS_static16_int.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
unsigned char *out, unsigned int out_sz);

int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T, int *S);
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
uint32_t *S);

// Rounds to next power of 2.
// credit to http://graphics.stanford.edu/~seander/bithacks.html
Expand Down Expand Up @@ -362,7 +363,7 @@ static inline int encode_freq1(uint8_t *in, uint32_t in_size, int Nway,

// Decide between 10-bit and 12-bit freqs.
// Fills out S[] to hold the new scaled maximum value.
int S[256] = {0};
uint32_t S[256] = {0};
int shift = rans_compute_shift(T, F, T, S);

// Normalise so T[i] == TOTFREQ_O1
Expand All @@ -372,7 +373,7 @@ static inline int encode_freq1(uint8_t *in, uint32_t in_size, int Nway,
if (T[i] == 0)
continue;

int max_val = S[i];
uint32_t max_val = S[i];
if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
max_val = TOTFREQ_O1_FAST;

Expand Down
5 changes: 3 additions & 2 deletions htscodecs/rANS_static32x16pr_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, x, z;
// -20 for order/size/meta
int bound = rans_compress_bound_4x16(in_size,0)-20;
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

if (!out) {
*out_size = bound;
Expand Down Expand Up @@ -693,7 +693,8 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
unsigned char *out, unsigned int *out_size) {
unsigned char *cp, *out_end, *out_free = NULL;
unsigned int tab_size;
int bound = rans_compress_bound_4x16(in_size,1)-20, z;
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
int z;
RansState ransN[NX] __attribute__((aligned(32)));

if (in_size < NX) // force O0 instead
Expand Down
5 changes: 3 additions & 2 deletions htscodecs/rANS_static32x16pr_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, x, z;
// -20 for order/size/meta
int bound = rans_compress_bound_4x16(in_size,0)-20;
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

if (!out) {
*out_size = bound;
Expand Down Expand Up @@ -444,7 +444,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
unsigned int *out_size) {
unsigned char *cp, *out_end, *out_free = NULL;
unsigned int tab_size;
int bound = rans_compress_bound_4x16(in_size,1)-20, z;
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
int z;
RansState ransN[32] __attribute__((aligned(64)));

if (in_size < 32) // force O0 instead
Expand Down
6 changes: 4 additions & 2 deletions htscodecs/rANS_static32x16pr_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ unsigned char *rans_compress_O0_32x16_neon(unsigned char *in,
uint8_t* ptr;
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, x, z;
int bound = rans_compress_bound_4x16(in_size,0)-20; // -20 for order/size/meta
// -20 for order/size/meta
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

if (!out) {
*out_size = bound;
Expand Down Expand Up @@ -912,7 +913,8 @@ unsigned char *rans_compress_O1_32x16_neon(unsigned char *in,
unsigned int *out_size) {
unsigned char *cp, *out_end, *out_free = NULL;
unsigned int tab_size;
int bound = rans_compress_bound_4x16(in_size,1)-20, z;
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
int z;
RansState ransN[NX];

if (in_size < NX) // force O0 instead
Expand Down
3 changes: 2 additions & 1 deletion htscodecs/rANS_static32x16pr_sse4.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,8 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
uint8_t* ptr;
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, x, z;
int bound = rans_compress_bound_4x16(in_size,0)-20; // -20 for order/size/meta
// -20 for order/size/meta
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

if (!out) {
*out_size = bound;
Expand Down
22 changes: 15 additions & 7 deletions htscodecs/rANS_static4x16pr.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
uint8_t* ptr;
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, rle, x;
int bound = rans_compress_bound_4x16(in_size,0)-20; // -20 for order/size/meta
// -20 for order/size/meta
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

if (!out) {
*out_size = bound;
Expand Down Expand Up @@ -345,15 +346,16 @@ unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
// 10 bit means smaller memory footprint when decoding and
// more speed due to cache hits, but it *may* be a poor
// compression fit.
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T, int *S) {
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
uint32_t *S) {
int i, j;

double e10 = 0, e12 = 0;
int max_tot = 0;
for (i = 0; i < 256; i++) {
if (F0[i] == 0)
continue;
int max_val = round2(T[i]);
unsigned int max_val = round2(T[i]);
int ns = 0;
#define MAX(a,b) ((a)>(b)?(a):(b))

Expand Down Expand Up @@ -414,8 +416,9 @@ unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size,
unsigned char *out, unsigned int *out_size) {
unsigned char *cp, *out_end, *out_free = NULL;
unsigned int tab_size;

int bound = rans_compress_bound_4x16(in_size,1)-20; // -20 for order/size/meta

// -20 for order/size/meta
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;

if (!out) {
*out_size = bound;
Expand Down Expand Up @@ -1094,9 +1097,14 @@ unsigned char *(*rans_dec_func(int do_simd, int order))
*
* Smallest is method, <in_size> <input>, so worst case 2 bytes longer.
*/
unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
unsigned char *out, unsigned int *out_size,
unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
unsigned char *out,unsigned int *out_size,
int order) {
if (in_size > INT_MAX) {
*out_size = 0;
return NULL;
}

unsigned int c_meta_len;
uint8_t *meta = NULL, *rle = NULL, *packed = NULL;
uint8_t *out_free = NULL;
Expand Down
5 changes: 5 additions & 0 deletions htscodecs/tokenise_name3.c
Original file line number Diff line number Diff line change
Expand Up @@ -1333,6 +1333,11 @@ uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
int *out_len, int *last_start_p) {
int last_start = 0, i, j, nreads;

if (len < 0) {
*out_len = 0;
return NULL;
}

// Count lines
for (nreads = i = 0; i < len; i++)
if (blk[i] <= '\n') // \n or \0 separated entries
Expand Down
6 changes: 5 additions & 1 deletion tests/arith_dynamic_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
// Room to allow for expanded BLK_SIZE on worst case compression.
#define BLK_SIZE2 ((105LL*BLK_SIZE)/100)

static unsigned char in_buf[BLK_SIZE2+257*257*3];
static unsigned char *in_buf;

// Max 4GB
static unsigned char *load(FILE *infp, uint32_t *lenp) {
Expand Down Expand Up @@ -87,6 +87,8 @@ int main(int argc, char **argv) {
struct timeval tv1, tv2, tv3, tv4;
size_t bytes = 0, raw = 0;

in_buf = malloc(BLK_SIZE2+257*257*3);

#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
_setmode(_fileno(stdout), _O_BINARY);
Expand Down Expand Up @@ -292,5 +294,7 @@ int main(int argc, char **argv) {
tv2.tv_usec - tv1.tv_usec,
(double)bytes / ((long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
tv2.tv_usec - tv1.tv_usec));

free(in_buf);
return 0;
}
4 changes: 3 additions & 1 deletion tests/entropy.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@
#include "htscodecs/rANS_static.h"
#include "htscodecs/rANS_static4x16.h"

#define BLK_SIZE 1024*1024
#ifndef BLK_SIZE
# define BLK_SIZE 1024*1024
#endif

// Max 4GB
static unsigned char *load(FILE *infp, uint32_t *lenp) {
Expand Down
7 changes: 4 additions & 3 deletions tests/fqzcomp_qual_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,9 @@ static unsigned char *load(char *fn, size_t *lenp) {
return data;
}

#define BLK_SIZE 300*1000000
//#define BLK_SIZE 100*100000
#ifndef BLK_SIZE
# define BLK_SIZE 300*1000000
#endif

int count_lines(unsigned char *in, size_t len) {
size_t i;
Expand Down Expand Up @@ -320,7 +321,7 @@ int main(int argc, char **argv) {
int decomp = 0, vers = 4; // CRAM version 4.0 (4) or 3.1 (3)
int strat = 0, raw = 0;
fqz_gparams *gp = NULL, gp_local;
int blk_size = BLK_SIZE; // MAX
uint32_t blk_size = BLK_SIZE; // MAX

#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
Expand Down
6 changes: 5 additions & 1 deletion tests/rANS_static4x16pr_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
// Room to allow for expanded BLK_SIZE on worst case compression.
#define BLK_SIZE2 ((105LL*BLK_SIZE)/100)

static unsigned char in_buf[BLK_SIZE2+257*257*3];
unsigned char *in_buf;

// Max 4GB
static unsigned char *load(FILE *infp, uint32_t *lenp) {
Expand Down Expand Up @@ -87,6 +87,8 @@ int main(int argc, char **argv) {
struct timeval tv1, tv2, tv3, tv4;
size_t bytes = 0, raw = 0;

in_buf = malloc(BLK_SIZE2+257*257*3);

#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
_setmode(_fileno(stdout), _O_BINARY);
Expand Down Expand Up @@ -332,5 +334,7 @@ int main(int argc, char **argv) {
tv2.tv_usec - tv1.tv_usec,
(double)bytes / ((long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
tv2.tv_usec - tv1.tv_usec));

free(in_buf);
return 0;
}
6 changes: 4 additions & 2 deletions tests/rANS_static_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
#endif

// Room to allow for expanded BLK_SIZE on worst case compression.
#define BLK_SIZE2 ((int)(1.05*BLK_SIZE))
#define BLK_SIZE2 (1.05*BLK_SIZE)

// Max 4GB
static unsigned char *load(FILE *infp, uint32_t *lenp) {
Expand Down Expand Up @@ -93,7 +93,7 @@ static unsigned char *load(FILE *infp, uint32_t *lenp) {
*/
int main(int argc, char **argv) {
int opt, order = 0;
unsigned char in_buf[BLK_SIZE2+257*257*3];
unsigned char *in_buf = malloc(BLK_SIZE2+257*257*3);
int decode = 0, test = 0;
FILE *infp = stdin, *outfp = stdout;
struct timeval tv1, tv2, tv3;
Expand Down Expand Up @@ -286,5 +286,7 @@ int main(int argc, char **argv) {
tv2.tv_usec - tv1.tv_usec,
(double)bytes / ((long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
tv2.tv_usec - tv1.tv_usec));

free(in_buf);
return 0;
}
14 changes: 11 additions & 3 deletions tests/tokenise_name3_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
#ifndef BLK_SIZE
#define BLK_SIZE 1*1024*1024
#endif
static char blk[BLK_SIZE*2]; // temporary fix for decoder, which needs more space
static char *blk;

// Max 4GB
static unsigned char *load(FILE *infp, uint32_t *lenp) {
Expand Down Expand Up @@ -218,8 +218,16 @@ int main(int argc, char **argv) {
_setmode(_fileno(stdout), _O_BINARY);
#endif

// temporary fix for decoder, which needs more space
blk = malloc(BLK_SIZE*2);

int ret;

if (argc > 1 && strcmp(argv[1], "-d") == 0)
return decode(argc-1, argv+1);
ret = decode(argc-1, argv+1);
else
return encode(argc, argv);
ret = encode(argc, argv);

free(blk);
return ret;
}

0 comments on commit 0523328

Please sign in to comment.