Skip to content

Commit

Permalink
feat: flipped bitmap can now be set before insertion
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Jul 29, 2023
1 parent c4deff6 commit f61c803
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 107 deletions.
15 changes: 9 additions & 6 deletions endToEndTests/test/query.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@ describe('The /query endpoint', () => {
.map(file => JSON.parse(fs.readFileSync(`${queriesPath}/${file}`)))
.forEach(testCase =>
it('should return data for the test case ' + testCase.testCaseName, async () => {
const response = await server
.post('/query')
.send(testCase.query)
.expect(200)
.expect('Content-Type', 'application/json');
return expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult);
let response = await server.post('/query').send(testCase.query);
try {
expect(response.status).to.equal(200);
expect(response.header['content-type']).to.equal('application/json');
} catch (error) {
console.error('Error in response header! Error body:', response.body);
throw error;
}
expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult);
})
);

Expand Down
18 changes: 12 additions & 6 deletions include/silo/storage/aa_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,23 @@ namespace silo {
class ZstdFastaReader;
enum class AA_SYMBOL : char;

struct AAPosition {
class AAPosition {
friend class boost::serialization::access;

template <class Archive>
void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) {
// clang-format off
archive& symbol_whose_bitmap_is_flipped;
archive& bitmaps;
archive & symbol_whose_bitmap_is_flipped;
archive & bitmaps;
// clang-format on
}

AAPosition() = default;

public:
explicit AAPosition(AA_SYMBOL symbol);
explicit AAPosition(std::optional<AA_SYMBOL> symbol);

AASymbolMap<roaring::Roaring> bitmaps;
std::optional<AA_SYMBOL> symbol_whose_bitmap_is_flipped = std::nullopt;
};
Expand All @@ -49,9 +55,9 @@ class AAStorePartition {
template <class Archive>
void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) {
// clang-format off
archive& sequence_count;
archive& positions;
archive& aa_symbol_x_bitmaps;
archive & sequence_count;
archive & positions;
archive & aa_symbol_x_bitmaps;
// clang-format on
}

Expand Down
12 changes: 9 additions & 3 deletions include/silo/storage/sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,23 @@ class access;
namespace silo {
class ZstdFastaReader;

struct NucPosition {
class NucPosition {
friend class boost::serialization::access;

template <class Archive>
void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) {
// clang-format off
archive& symbol_whose_bitmap_is_flipped;
archive& bitmaps;
archive & symbol_whose_bitmap_is_flipped;
archive & bitmaps;
// clang-format on
}

NucPosition() = default;

public:
explicit NucPosition(NUCLEOTIDE_SYMBOL symbol);
explicit NucPosition(std::optional<NUCLEOTIDE_SYMBOL> symbol);

NucleotideSymbolMap<roaring::Roaring> bitmaps;
std::optional<NUCLEOTIDE_SYMBOL> symbol_whose_bitmap_is_flipped = std::nullopt;
};
Expand Down
92 changes: 53 additions & 39 deletions src/silo/storage/aa_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@
#include "silo/common/zstdfasta_reader.h"
#include "silo/preprocessing/preprocessing_exception.h"

/// Constructs a position whose `symbol_whose_bitmap_is_flipped` marker is
/// preset to `symbol`.
///
/// The marker is set in the member-initializer list so the optional is
/// constructed directly with its value instead of being default-initialized
/// and then assigned in the constructor body.
///
/// @param symbol The amino acid symbol to record as the flipped one.
silo::AAPosition::AAPosition(AA_SYMBOL symbol)
    : symbol_whose_bitmap_is_flipped(symbol) {}

/// Constructs a position from an optional flipped-symbol marker.
///
/// Passing `std::nullopt` leaves the position without a flipped-symbol
/// marker; passing a symbol records that symbol as the flipped one.
/// The member is set in the member-initializer list rather than assigned in
/// the constructor body.
///
/// @param symbol The symbol to record as flipped, or `std::nullopt` for none.
silo::AAPosition::AAPosition(std::optional<AA_SYMBOL> symbol)
    : symbol_whose_bitmap_is_flipped(symbol) {}

/// Creates a partition with one `AAPosition` per symbol of the reference
/// sequence.
///
/// Each position is constructed from the reference symbol at that index, so
/// its `symbol_whose_bitmap_is_flipped` marker starts out as the reference
/// symbol.
///
/// @param reference_sequence The reference amino acid sequence; its length
///        determines the number of positions.
silo::AAStorePartition::AAStorePartition(const std::vector<AA_SYMBOL>& reference_sequence)
    : reference_sequence(reference_sequence) {
   // The final size is known up front: allocate once instead of growing
   // repeatedly while emplacing.
   // NOTE(review): assumes `positions` is a std::vector — confirm in aa_store.h.
   positions.reserve(reference_sequence.size());
   for (const AA_SYMBOL symbol : reference_sequence) {
      positions.emplace_back(symbol);
   }
}

size_t silo::AAStorePartition::fill(silo::ZstdFastaReader& input_file) {
static constexpr size_t BUFFER_SIZE = 1024;

Expand Down Expand Up @@ -47,53 +62,58 @@ const roaring::Roaring* silo::AAStorePartition::getBitmap(size_t position, AA_SY
void silo::AAStorePartition::fillIndexes(const std::vector<std::string>& sequences) {
const size_t genome_length = positions.size();
static constexpr int COUNT_SYMBOLS_PER_PROCESSOR = 64;
const tbb::blocked_range<size_t> positions_range(
0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR
);
tbb::parallel_for(positions_range, [&](const decltype(positions_range)& local) {
AASymbolMap<std::vector<uint32_t>> ids_per_symbol_for_current_position;
for (size_t position = local.begin(); position != local.end(); ++position) {
const size_t number_of_sequences = sequences.size();
for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) {
const char character = sequences[sequence_id][position];
const auto symbol = charToAASymbol(character);
if (!symbol.has_value()) {
throw PreprocessingException(
"Found invalid symbol in Amino Acid sequence: " + std::to_string(character) +
"\nFull sequence: " + sequences[sequence_id]
);
tbb::parallel_for(
tbb::blocked_range<size_t>(0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR),
[&](const auto& local) {
AASymbolMap<std::vector<uint32_t>> ids_per_symbol_for_current_position;
for (size_t position = local.begin(); position != local.end(); ++position) {
const size_t number_of_sequences = sequences.size();
for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) {
const char character = sequences[sequence_id][position];
const auto symbol = charToAASymbol(character);
if (!symbol.has_value()) {
throw PreprocessingException(
"Found invalid symbol in Amino Acid sequence: " + std::to_string(character) +
"\nFull sequence: " + sequences[sequence_id]
);
}
if (symbol != AA_SYMBOL::X) {
ids_per_symbol_for_current_position[*symbol].push_back(
sequence_count + sequence_id
);
}
}
if (symbol != AA_SYMBOL::X) {
ids_per_symbol_for_current_position[*symbol].push_back(sequence_count + sequence_id);
}
}
for (const auto& symbol : AA_SYMBOLS) {
if (!ids_per_symbol_for_current_position.at(symbol).empty()) {
positions[position].bitmaps[symbol].addMany(
ids_per_symbol_for_current_position.at(symbol).size(),
ids_per_symbol_for_current_position.at(symbol).data()
);
ids_per_symbol_for_current_position[symbol].clear();
for (const AA_SYMBOL symbol : AA_SYMBOLS) {
if (!ids_per_symbol_for_current_position.at(symbol).empty()) {
positions[position].bitmaps[symbol].addMany(
ids_per_symbol_for_current_position.at(symbol).size(),
ids_per_symbol_for_current_position.at(symbol).data()
);
ids_per_symbol_for_current_position[symbol].clear();
}
if (symbol == positions[position].symbol_whose_bitmap_is_flipped) {
positions[position].bitmaps[symbol].flip(
sequence_count, sequence_count + number_of_sequences
);
}
}
}
}
});
);
}

void silo::AAStorePartition::fillXBitmaps(const std::vector<std::string>& sequences) {
const size_t genome_length = positions.size();

aa_symbol_x_bitmaps.resize(sequence_count + sequences.size());

const tbb::blocked_range<size_t> range(0, sequences.size());
tbb::parallel_for(range, [&](const decltype(range)& local) {
// For every symbol, calculate all sequence IDs that have that symbol at that position
tbb::parallel_for(tbb::blocked_range<size_t>(0, sequences.size()), [&](const auto& local) {
std::vector<uint32_t> positions_with_aa_symbol_x;
for (size_t sequence_id = local.begin(); sequence_id != local.end(); ++sequence_id) {
for (size_t position = 0; position < genome_length; ++position) {
const char character = sequences[sequence_id][position];
// No need to check the cast because we call fillIndexes first
const auto symbol = static_cast<AA_SYMBOL>(character);
const auto symbol = charToAASymbol(character);
if (symbol == AA_SYMBOL::X) {
positions_with_aa_symbol_x.push_back(position);
}
Expand All @@ -115,10 +135,6 @@ void silo::AAStorePartition::interpret(const std::vector<std::string>& sequences
sequence_count += sequences.size();
}

silo::AAStorePartition::AAStorePartition(const std::vector<AA_SYMBOL>& reference_sequence)
: reference_sequence(reference_sequence),
positions(reference_sequence.size()) {}

size_t silo::AAStorePartition::computeSize() const {
size_t result = 0;
for (const auto& position : positions) {
Expand All @@ -131,8 +147,7 @@ size_t silo::AAStorePartition::computeSize() const {

size_t silo::AAStorePartition::runOptimize() {
std::atomic<size_t> count_true = 0;
const tbb::blocked_range<size_t> range(0U, positions.size());
tbb::parallel_for(range, [&](const decltype(range) local) {
tbb::parallel_for(tbb::blocked_range<size_t>(0U, positions.size()), [&](const auto& local) {
for (auto position = local.begin(); position != local.end(); ++position) {
for (const AA_SYMBOL symbol : AA_SYMBOLS) {
if (positions[position].bitmaps[symbol].runOptimize()) {
Expand All @@ -146,8 +161,7 @@ size_t silo::AAStorePartition::runOptimize() {

size_t silo::AAStorePartition::shrinkToFit() {
std::atomic<size_t> saved = 0;
const tbb::blocked_range<size_t> range(0U, positions.size());
tbb::parallel_for(range, [&](const decltype(range) local) {
tbb::parallel_for(tbb::blocked_range<size_t>(0U, positions.size()), [&](const auto& local) {
size_t local_saved = 0;
for (auto position = local.begin(); position != local.end(); ++position) {
for (const AA_SYMBOL symbol : AA_SYMBOLS) {
Expand Down
22 changes: 18 additions & 4 deletions src/silo/storage/database_partition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,26 @@ void DatabasePartition::flipBitmaps() {
auto& positions = seq_store.positions;
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, positions.size()), [&](const auto& local) {
for (auto position = local.begin(); position != local.end(); ++position) {
std::optional<NUCLEOTIDE_SYMBOL> flipped_bitmap_before =
positions[position].symbol_whose_bitmap_is_flipped;
std::optional<NUCLEOTIDE_SYMBOL> max_symbol = std::nullopt;
uint32_t max_count = 0;

for (const auto& symbol : NUC_SYMBOLS) {
roaring::Roaring bitmap = positions[position].bitmaps.at(symbol);
bitmap.runOptimize();
const uint32_t count = bitmap.cardinality();
const uint32_t count = flipped_bitmap_before == symbol
? sequenceCount - bitmap.cardinality()
: bitmap.cardinality();
if (count > max_count) {
max_symbol = symbol;
max_count = count;
}
}
if (max_symbol.has_value()) {
if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) {
if (flipped_bitmap_before.has_value()) {
positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount);
}
positions[position].symbol_whose_bitmap_is_flipped = max_symbol;
positions[position].bitmaps[*max_symbol].flip(0, sequenceCount);
}
Expand All @@ -48,19 +55,26 @@ void DatabasePartition::flipBitmaps() {
auto& positions = seq_store.positions;
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, positions.size()), [&](const auto& local) {
for (auto position = local.begin(); position != local.end(); ++position) {
std::optional<AA_SYMBOL> flipped_bitmap_before =
positions[position].symbol_whose_bitmap_is_flipped;
std::optional<AA_SYMBOL> max_symbol = std::nullopt;
uint32_t max_count = 0;

for (const auto& symbol : AA_SYMBOLS) {
roaring::Roaring bitmap = positions[position].bitmaps.at(symbol);
bitmap.runOptimize();
const uint32_t count = bitmap.cardinality();
const uint32_t count = flipped_bitmap_before == symbol
? sequenceCount - bitmap.cardinality()
: bitmap.cardinality();
if (count > max_count) {
max_symbol = symbol;
max_count = count;
}
}
if (max_symbol.has_value()) {
if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) {
if (flipped_bitmap_before.has_value()) {
positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount);
}
positions[position].symbol_whose_bitmap_is_flipped = max_symbol;
positions[position].bitmaps[*max_symbol].flip(0, sequenceCount);
}
Expand Down
8 changes: 4 additions & 4 deletions src/silo/storage/position.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ void deserializeFromFile(const std::string& filename, silo::NucPosition& positio
TEST(Position, shouldSerializeAndDeserializePositionsWithEmptyOptional) {
const std::string test_file = "test.bin";

const silo::NucPosition position_with_unset_optional;
const silo::NucPosition position_with_unset_optional(std::nullopt);
serializeToFile(test_file, position_with_unset_optional);

silo::NucPosition deserialized_position;
silo::NucPosition deserialized_position(std::nullopt);
deserializeFromFile(test_file, deserialized_position);

EXPECT_FALSE(position_with_unset_optional.symbol_whose_bitmap_is_flipped.has_value());
Expand All @@ -39,11 +39,11 @@ TEST(Position, shouldSerializeAndDeserializePositionsWithEmptyOptional) {
TEST(Position, shouldSerializeAndDeserializePositionWithSetOptional) {
const std::string test_file = "test.bin";

silo::NucPosition position_with_set_optional;
silo::NucPosition position_with_set_optional(std::nullopt);
position_with_set_optional.symbol_whose_bitmap_is_flipped = silo::NUCLEOTIDE_SYMBOL::A;
serializeToFile(test_file, position_with_set_optional);

silo::NucPosition deserialized_position;
silo::NucPosition deserialized_position(std::nullopt);
deserializeFromFile(test_file, deserialized_position);

EXPECT_TRUE(deserialized_position.symbol_whose_bitmap_is_flipped.has_value());
Expand Down
Loading

0 comments on commit f61c803

Please sign in to comment.