diff --git a/hts.c b/hts.c index 839ec7113..ddaa60bfb 100644 --- a/hts.c +++ b/hts.c @@ -3482,39 +3482,32 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; - int decimals = 0, e = 0, lost = 0, has_digit = 0; + int digits = 0, decimals = 0, e = 0, lost = 0; char sign = '+', esign = '+'; - const char *s; + const char *s, *str_orig = str; while (isspace_c(*str)) str++; s = str; if (*s == '+' || *s == '-') sign = *s++; while (*s) - if (isdigit_c(*s)) n = push_digit(n, *s++), has_digit = 1; + if (isdigit_c(*s)) digits++, n = push_digit(n, *s++); else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { s++; - while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++), has_digit = 1; - } - - // there must have been a digit or else cannot be a valid number - if ( !has_digit ) - { - if ( strend ) *strend = (char*)str; - return 0; + while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++); } - if (*s == 'E' || *s == 'e') { + switch (*s) { + case 'e': case 'E': s++; if (*s == '+' || *s == '-') esign = *s++; while (isdigit_c(*s)) e = push_digit(e, *s++); if (esign == '-') e = -e; - } + break; - switch (*s) { case 'k': case 'K': e += 3; s++; break; case 'm': case 'M': e += 6; s++; break; case 'g': case 'G': e += 9; s++; break; @@ -3529,7 +3522,10 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) } if (strend) { - *strend = (char *)s; + // Set to the original input str pointer if not valid number syntax + *strend = (digits > 0)? (char *)s : (char *)str_orig; + } else if (digits == 0) { + hts_log_warning("Invalid numeric value %.8s[truncated]", str); } else if (*s) { if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); diff --git a/htslib/hts.h b/htslib/hts.h index 48c263cab..43dfb5f97 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1137,10 +1137,26 @@ int hts_idx_nseq(const hts_idx_t *idx); @param strend If non-NULL, set on return to point to the first character in @a str after those forming the parsed number @param flags Or'ed-together combination of HTS_PARSE_* flags - @return Converted value of the parsed number. - - When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING - or more) if there are any trailing characters after the number. + @return Integer value of the parsed number, or 0 if no valid number + + The input string is parsed as: optional whitespace; an optional '+' or + '-' sign; decimal digits possibly including ',' characters (if @a flags + includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional + case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific + notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and + decimal digits. To be considered a valid numeric value, the main part (not + including any suffix or scientific notation) must contain at least one + digit (either before or after the decimal point). + + When @a strend is NULL, @a str is expected to contain only (optional + whitespace followed by) the numeric value. A warning will be printed + (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number + is found or if there are any unused characters after the number. + + When @a strend is non-NULL, @a str starts with (optional whitespace + followed by) the numeric value. On return, @a strend is set to point + to the first unused character after the numeric value, or to @a str + if no valid parsable number is found. */ HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); diff --git a/test/sam.c b/test/sam.c index b6f6c0e04..cc5bfe77a 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1655,6 +1655,37 @@ static int read_data_block(const char *in_name, samFile *fp_in, return ret; } +static void test_parse_decimal1(long long exp, const char *str, size_t exp_consumed, int flags, const char *warning) +{ + if (warning) fprintf(stderr, "(Expect %s message for \"%s\")\n", warning, str); + + long long val = hts_parse_decimal(str, NULL, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", NULL, %d) returned %lld, expected %lld", str, flags, val, exp); + + char *end; + val = hts_parse_decimal(str, &end, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", ..., %d) returned %lld, expected %lld", str, flags, val, exp); + size_t consumed = end - str; + if (consumed != exp_consumed) fail("hts_parse_decimal(\"%s\", ..., %d) consumed %zu chars, expected %zu", str, flags, consumed, exp_consumed); +} + +static void test_parse_decimal(void) +{ + test_parse_decimal1(37, "+37", 3, 0, NULL); + test_parse_decimal1(-1001, " \t -1,001x", 9, HTS_PARSE_THOUSANDS_SEP, "trailing 'x'"); + test_parse_decimal1(LLONG_MAX, "+9223372036854775807", 20, 0, NULL); + test_parse_decimal1(LLONG_MIN, "-9,223,372,036,854,775,808", 26, HTS_PARSE_THOUSANDS_SEP, NULL); + test_parse_decimal1(1500, "1.5e3", 5, 0, NULL); + test_parse_decimal1(1500, "1.5e+3k", 6, 0, "trailing 'k'"); + test_parse_decimal1(1500000000, "1.5G", 4, 0, NULL); + test_parse_decimal1(12345, "12.345k", 7, 0, NULL); + test_parse_decimal1(12345, "12.3456k", 8, 0, "dropped fraction"); + test_parse_decimal1(0, "A", 0, 0, "invalid numeric"); + test_parse_decimal1(0, "G", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " +/-", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " \t -.e+9999", 0, 0, "invalid numeric"); +} + static void test_mempolicy(void) { size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, i; @@ -2194,6 +2225,7 @@ int main(int argc, char **argv) check_cigar_tab(); check_big_ref(0); check_big_ref(1); + test_parse_decimal(); test_mempolicy(); set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]);