Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1.0.1] Implement the two tests required to validate issue #694 #711

Merged
merged 13 commits into from
Sep 6, 2024
Merged
3 changes: 2 additions & 1 deletion unittests/savanna_cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ namespace savanna_cluster {
// Take a snapshot of this node's chain state and return it serialized as a string
// (suitable for later `open_from_snapshot()`).
std::string snapshot() const {
dlog("node ${i} - taking snapshot", ("i", _node_idx));
auto writer = buffered_snapshot_suite::get_writer();
control->abort_block(); // abort any in-progress speculative block first — presumably write_snapshot requires no pending block; confirm against controller API
control->write_snapshot(writer);
return buffered_snapshot_suite::finalize(writer);
}
Expand Down Expand Up @@ -544,7 +545,7 @@ namespace savanna_cluster {
// -------------------
// -------------------
// Debug helper: when `_debug_mode` is enabled, print a block's number, timestamp slot,
// and truncated id/previous-id to stdout. No-op otherwise.
// Note: the diff view retained both the pre- and post-change output statement; only the
// updated statement (which includes the block number) is kept here.
void print(const char* name, const signed_block_ptr& b) const {
if (_debug_mode)
std::cout << name << " (" << b->block_num() << ") timestamp = " << b->timestamp.slot << ", id = " << b->calculate_id().str().substr(8, 16)
<< ", previous = " << b->previous.str().substr(8, 16) << '\n';
}

Expand Down
32 changes: 32 additions & 0 deletions unittests/savanna_disaster_recovery_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,4 +291,36 @@ BOOST_FIXTURE_TEST_CASE(all_nodes_shutdown_with_reversible_blocks_lost, savanna_
}));
} FC_LOG_AND_RETHROW()


// --------------------------------------------------------------------------------------------
// test to reproduce error from issue #709. When starting a node from a snapshot with a fork_db
// containing only the root block, we access `prev_finality_ext` which is empty because the
// header extension cache has not been initialized.
// --------------------------------------------------------------------------------------------
BOOST_FIXTURE_TEST_CASE(restart_from_fork_db_with_only_root_block, savanna_cluster::cluster_t) try {
auto& C=_nodes[2];

// advance lib by 2, then snapshot, then produce two more blocks (b1, b2) past the snapshot
BOOST_REQUIRE_EQUAL(2u, C.lib_advances_by([&]() { C.produce_blocks(2); }));
auto snapshot = C.snapshot();
signed_block_ptr b1, b2;
BOOST_REQUIRE_EQUAL(2u, C.lib_advances_by([&]() { b1 = C.produce_block(); b2 = C.produce_block(); }));

// Partition C by itself, so it doesn't receive b1 and b2 when opened
const std::vector<size_t> tmp_partition {2};
set_partition(tmp_partition);

C.close();
C.remove_state();
C.remove_reversible_data_and_blocks_log();

C.open_from_snapshot(snapshot); // at this point, fork_db's root is the snapshot block, and doesn't contain any other blocks
C.close(); // close node
C.open(); // and open(), so we get the root block_state from fork_db and not from the snapshot

#if 0 // uncomment when issue #709 is fixed.
C.push_block(b1); // when creating the block_state for b1, `prev` will be the root block_state loaded from
// fork_db, which doesn't have the header extension cache created (issue #709)
#endif
} FC_LOG_AND_RETHROW()

BOOST_AUTO_TEST_SUITE_END()
271 changes: 271 additions & 0 deletions unittests/savanna_misc_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,5 +323,276 @@ BOOST_FIXTURE_TEST_CASE(gh_534_liveness_issue, savanna_cluster::cluster_t) try {

} FC_LOG_AND_RETHROW()

// ---------------------------------------------------------------------------------------------------
// validate qc after restart from snapshot with no blocklog or fork database
// -------------------------------------------------------------------------
//
// B1 <- B2 <- B3 <- B4 <- B5 <- B6
//
// where:
// B2 claims a strong QC on B1.
// B3 claims a strong QC on B1.
// B4 claims a strong QC on B2. (B4 makes B1 final.)
// B5 claims a strong QC on B4. (B5 makes B2 final.)
// B6 claims a strong QC on B5. (B6 makes B4 final.)
//
// Let's say a node operator decided to take a snapshot on B3. After their node receives B6, B4 becomes final and the
// snapshot on B3 becomes available.
//
// Then the operator shuts down nodeos and decides to restart from the snapshot on B3.
//
// After starting up from the snapshot, their node receives block B4 from the P2P network. Since B4 advances the QC
// claim relative to its parent (from a strong QC claimed on B1 to a strong QC claimed on B2), it must include a QC
// attached to justify its claim. It does in fact contain the strong QC on block B2, but how does this node verify the
// QC? It started with B3 as the root block of its fork database, so block B2 does not exist in the fork database.
// ---------------------------------------------------------------------------------------------------
BOOST_FIXTURE_TEST_CASE(validate_qc_after_restart_from_snapshot, savanna_cluster::cluster_t) try {
using namespace savanna_cluster;
auto& A=_nodes[0];

// _debug_mode = true;
auto b1 = A.produce_block(); // receives strong votes from all finalizers
print("b1", b1);

const std::vector<size_t> partition {0}; // partition A so that B, C and D don't see b2 (yet)
set_partition(partition);

auto b2 = A.produce_block(); // receives just 1 strong vote from A
print("b2", b2);
BOOST_REQUIRE_EQUAL(qc_s(qc(b2)), strong_qc(b1)); // b2 claims a strong QC on b1

auto b3 = A.produce_block(); // b3 repeats b2 strong qc claim on b1 (because no qc on b2)
print("b3", b3);
BOOST_REQUIRE(!qc(b3));

auto b3_snapshot = A.snapshot();

set_partition({}); // remove partition so A will receive votes on b2 and b3

push_block(0, b2); // other nodes receive b2 and vote on it, so A forms a qc on b2
auto b4 = A.produce_block();
print("b4", b4);
BOOST_REQUIRE_EQUAL(qc_s(qc(b4)), strong_qc(b2)); // b4 claims a strong QC on b2. (b4 makes b1 final.)
BOOST_REQUIRE_EQUAL(A.lib_number, b1->block_num());

push_block(0, b3);
push_block(0, b4); // push b4 again as it was unlinkable until the other
// nodes received b3

auto b5 = A.produce_block();
print("b5", b5);
BOOST_REQUIRE_EQUAL(qc_s(qc(b5)), strong_qc(b4)); // b5 claims a strong QC on b4. (b5 makes b2 final.)
BOOST_REQUIRE_EQUAL(A.lib_number, b2->block_num());

auto b6 = A.produce_block();
print("b6", b6);
BOOST_REQUIRE_EQUAL(qc_s(qc(b6)), strong_qc(b5)); // b6 claims a strong QC on b5. (b6 makes b4 final.)
BOOST_REQUIRE_EQUAL(A.lib_number, b4->block_num());

// Then the operator shuts down nodeos and decides to restart from the snapshot on B3.
A.close();
A.remove_state();
A.remove_reversible_data_and_blocks_log();

set_partition({0}); // partition A so it doesn't receive blocks on `open()`
A.open_from_snapshot(b3_snapshot);

#if 0 // uncomment when issue #694 is fixed.
// After starting up from the snapshot, their node receives block b4 from the P2P network.
// Since b4 advances the QC claim relative to its parent (from a strong QC claimed on b1
// to a strong QC claimed on b2), it must include a QC attached to justify its claim.
// It does in fact contain the strong QC on block b2, but how does this node verify the QC?
// It started with b3 as the root block of its fork database, so block b2 does not exist in
// the fork database.
// -----------------------------------------------------------------------------------------
A.push_block(b4); // when pushing b4, if we try to access any block state
A.push_block(b5); // before b3, we will fail with a `verify_qc_claim`
A.push_block(b6); // exception, which is what happens until issue
// #694 is addressed.
#endif
} FC_LOG_AND_RETHROW()


// ---------------------------------------------------------------------------------------------------
// Missing finalizer policies needed to validate qc after
// restart from snapshot with no blocklog or fork database
// -------------------------------------------------------
//
//
// The node processes the following blockchain:
//
// <- B1 <- B2 <- B3 <- B4 <- B5 <- B6 <- B7 <- B8 <- B9
//
// where:
//
// B1 has active finalizer policy P1 and no pending finalizer policy.
// B1 proposes finalizer policy P2.
//
// B2 claims a strong QC on B1.
// B2 has active finalizer policy P1 and no pending finalizer policy.
//
// B3 claims a strong QC on B2. (B3 makes B1 final.)
// B3 has active finalizer policy P1 and has pending finalizer policy P2.
//
// B4 claims a strong QC on B3. (B4 makes B2 final.)
// B4 has active finalizer policy P1 and has pending finalizer policy P2.
//
// B5 claims a strong QC on B3.
// B5 has active finalizer policy P1 and has pending finalizer policy P2.
//
// B6 claims a strong QC on B4. (B6 makes B3 final.)
// B6 has active finalizer policy P2 and no pending finalizer policy.
// (At this point, in the current implementation policy P2 is lost from the block_header_state
// of B6, which is the source of the problem.)
//
// B7 claims a strong QC on B5.
// B7 has active finalizer policy P2 and no pending finalizer policy.
//
// B8 claims a strong QC on B6. (B8 makes B4 final.)
// B8 has active finalizer policy P2 and no pending finalizer policy.
//
// B9 claims a strong QC on B8. (B9 makes B6 final.)
// B9 has active finalizer policy P2 and no pending finalizer policy.
//
// The node operator decided to take a snapshot on B6. After their node receives B9, B6 becomes
// final and the snapshot on B6 becomes available to the node operator as a valid snapshot.
//
// Then the operator shuts down nodeos and decides to restart from the snapshot on B6.
//
// After starting up from the snapshot, their node receives block B7 from the P2P network.
// Since B7 advances the QC claim relative to its parent (from a strong QC claimed on B4 to a
// strong QC claimed on B5), it must include a QC attached to justify its claim. It does in fact
// contain the strong QC on block B5, but how does this node verify the QC? It started with B6
// as the root block of its fork database, so block B5 does not exist in the fork database.
//
// Yes, the finality digest for B5 can be retrieved from the finality_core in the block_header_state
// for B6. But the block_header_state of B6 contains an active_finalizer_policy of policy P2 and it
// contains no pending_finalizer_policy. Not only does it not know the generation numbers for the
// active and pending (if present) finalizer policies of B5, even if it did know the generation
// numbers, it simply would no longer have policy P1 which it needs to validate the QC for block B5.
//
// The solution is to augment the state tracked in block_header_state.
//
// ---------------------------------------------------------------------------------------------------
BOOST_FIXTURE_TEST_CASE(validate_qc_requiring_finalizer_policies, savanna_cluster::cluster_t) try {
using namespace savanna_cluster;
auto& A=_nodes[0];

// _debug_mode = true;

// update finalizer_policy with a new key for B
// --------------------------------------------
base_tester::finalizer_policy_input input;
for (size_t i=0; i<num_nodes(); ++i)
input.finalizers.emplace_back(_fin_keys[i], 1);
input.finalizers[1] = { _fin_keys[num_nodes()], 1 }; // overwrite finalizer key for B
input.threshold = (input.finalizers.size() * 2) / 3 + 1;
A.set_finalizers(input);

auto b1 = A.produce_block(); // b1 has active finalizer policy p1 and no pending finalizer policy.
print("b1", b1); // b1 proposes finalizer policy p2.
auto p1 = A.head_active_finalizer_policy()->generation;

auto b2 = A.produce_block();
print("b2", b2);
BOOST_REQUIRE_EQUAL(qc_s(qc(b2)), strong_qc(b1)); // b2 claims a strong QC on b1

auto b3 = A.produce_block();
print("b3", b3);
BOOST_REQUIRE_EQUAL(qc_s(qc(b3)), strong_qc(b2)); // b3 claims a strong QC on b2
BOOST_REQUIRE_EQUAL(A.lib_number, b1->block_num()); // b3 makes B1 final

auto pending = A.head_pending_finalizer_policy();
BOOST_REQUIRE(!!pending); // check that we have a pending finalizer policy
auto p2 = pending->generation; // and its generation is higher than the active one
BOOST_REQUIRE_EQUAL(p2, p1 + 1); // b3 has new pending finalizer policy p2

const std::vector<size_t> partition {0}; // partition A so that B, C and D don't see b4 (yet)
set_partition(partition); // and don't vote on it

auto b4 = A.produce_block();
print("b4", b4);
BOOST_REQUIRE_EQUAL(qc_s(qc(b4)), strong_qc(b3)); // b4 claims a strong QC on b3
BOOST_REQUIRE_EQUAL(A.lib_number, b2->block_num()); // b4 makes B2 final
pending = A.head_pending_finalizer_policy();
BOOST_REQUIRE_EQUAL(pending->generation, p2); // b4 has new pending finalizer policy p2

auto b5 = A.produce_block();
print("b5", b5);
BOOST_REQUIRE(!qc(b5)); // b5 doesn't include a new qc (duplicates b4's strong claim on b3)
BOOST_REQUIRE_EQUAL(A.lib_number, b2->block_num()); // finality unchanged stays at b2
pending = A.head_pending_finalizer_policy();
BOOST_REQUIRE_EQUAL(pending->generation, p2); // b5 still has new pending finalizer policy p2
// since finality did not advance

set_partition({}); // remove partition so A will receive votes on b4 and b5

push_block(0, b4); // other nodes receive b4 and vote on it, so A forms a qc on b4
auto b6 = A.produce_block();
print("b6", b6);
BOOST_REQUIRE_EQUAL(qc_s(qc(b6)), strong_qc(b4)); // b6 claims a strong QC on b4
BOOST_REQUIRE_EQUAL(A.lib_number, b3->block_num()); // b6 makes b3 final.

auto active = A.head_active_finalizer_policy();
BOOST_REQUIRE_EQUAL(active->generation, p2); // b6 has active finalizer policy p2
BOOST_REQUIRE(!A.head_pending_finalizer_policy()); // and no pending finalizer policy.

auto b6_snapshot = A.snapshot();

// At this point, in the current implementation, policy p2 is lost from the
// block_header_state of b6, which is the source of the problem (see issue #694).

push_block(0, b5);

auto b7 = A.produce_block();
print("b7", b7);
BOOST_REQUIRE_EQUAL(qc_s(qc(b7)), strong_qc(b5)); // b7 claims a strong QC on b5
BOOST_REQUIRE_EQUAL(A.lib_number, b3->block_num()); // lib is still b3

active = A.head_active_finalizer_policy();
BOOST_REQUIRE_EQUAL(active->generation, p2); // b7 has active finalizer policy p2
BOOST_REQUIRE(!A.head_pending_finalizer_policy()); // and no pending finalizer policy.

push_block(0, b6); // push b6 again as it was unlinkable until the other
// nodes received b5

auto b8 = A.produce_block();
print("b8", b8);
BOOST_REQUIRE_EQUAL(qc_s(qc(b8)), strong_qc(b6)); // b8 claims a strong QC on b6
BOOST_REQUIRE_EQUAL(A.lib_number, b4->block_num()); // b8 makes B4 final

active = A.head_active_finalizer_policy();
BOOST_REQUIRE_EQUAL(active->generation, p2); // b8 has active finalizer policy p2
BOOST_REQUIRE(!A.head_pending_finalizer_policy()); // and no pending finalizer policy.

push_block(0, b7); // push b7 and b8 as they were unlinkable until the other
push_block(0, b8); // nodes received b6

auto b9 = A.produce_block();
print("b9", b9);
BOOST_REQUIRE_EQUAL(qc_s(qc(b9)), strong_qc(b8)); // b9 claims a strong QC on b8
BOOST_REQUIRE_EQUAL(A.lib_number, b6->block_num()); // b9 makes B6 final

active = A.head_active_finalizer_policy();
BOOST_REQUIRE_EQUAL(active->generation, p2); // b9 has active finalizer policy p2
BOOST_REQUIRE(!A.head_pending_finalizer_policy()); // and no pending finalizer policy.

// restart from b6 snapshot.
// -------------------------
A.close();
A.remove_state();
A.remove_reversible_data_and_blocks_log();

set_partition({0}); // partition A so it doesn't receive blocks on `open()`
A.open_from_snapshot(b6_snapshot);

#if 0 // uncomment when issue #694 is fixed.
A.push_block(b7); // when pushing b7, if we try to access any block state
A.push_block(b8); // before b6, we will fail with a `verify_qc_claim`
A.push_block(b9); // exception, which is what happens until issue
// #694 is addressed.
#endif

} FC_LOG_AND_RETHROW()


BOOST_AUTO_TEST_SUITE_END()