Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster nibble parsing and a SSSE3 routine for even faster nibble parsing #1677

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 66 additions & 8 deletions sam_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include <stdint.h>
#include "htslib/sam.h"

#ifdef __SSSE3__
#include "tmmintrin.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -87,15 +89,71 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
"B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
"N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";

int i, len2 = len/2;
seq[0] = 0;

for (i = 0; i < len2; i++)
const char *seq_end_ptr = seq + len;
char *seq_cursor = seq;
const uint8_t *nibble_cursor = nib;
const char *seq_end_ptr_twoatatime = seq + (len & (~1ULL));
#ifdef __SSSE3__
const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
while (seq_cursor < seq_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);

nibble_cursor += sizeof(__m128i);
seq_cursor += 2 * sizeof(__m128i);
}
#endif
while (seq_cursor < seq_end_ptr_twoatatime) {
// Note size_t cast helps gcc optimiser.
memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2);

if ((i *= 2) < len)
seq[i] = seq_nt16_str[bam_seqi(nib, i)];
memcpy(seq_cursor, code2base + ((size_t)*nibble_cursor * 2), 2);
seq_cursor += 2;
nibble_cursor += 1;
}
if (seq_cursor != seq_end_ptr) {
/* There is a single encoded nuc left */
uint8_t nibble_c = *nibble_cursor;
uint8_t upper_nuc_index = nibble_c >> 4;
seq_cursor[0] = seq_nt16_str[upper_nuc_index];
}
}

#ifdef __cplusplus
Expand Down