From 6a2caabbd7d7238056999c5c89eb0778a79df8ba Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Mon, 21 Dec 2020 13:58:18 +0100 Subject: [PATCH 1/7] Implementing new stability timeout --- CMakeLists.txt | 1 + srtcore/core.h | 4 + srtcore/group.cpp | 197 +++++++++++++++++++++++++++++----------------- 3 files changed, 130 insertions(+), 72 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5f06d3f7..626bbc7e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ endforeach() # SRT_DEBUG_TSBPD_WRAP 1 /* Debug packet timestamp wraparound */ # SRT_DEBUG_TLPKTDROP_DROPSEQ 1 # SRT_DEBUG_SNDQ_HIGHRATE 1 +# SRT_DEBUG_BONDING_STATES 1 # SRT_MAVG_SAMPLING_RATE 40 /* Max sampling rate */ # option defaults diff --git a/srtcore/core.h b/srtcore/core.h index b5cd59d08..90bd637ee 100644 --- a/srtcore/core.h +++ b/srtcore/core.h @@ -374,9 +374,12 @@ class CUDT bool isOPT_TsbPd() const { return m_bOPT_TsbPd; } int RTT() const { return m_iRTT; } + int RTTVar() const { return m_iRTTVar; } int32_t sndSeqNo() const { return m_iSndCurrSeqNo; } int32_t schedSeqNo() const { return m_iSndNextSeqNo; } bool overrideSndSeqNo(int32_t seq); + srt::sync::steady_clock::time_point LastRspTime() const { return m_tsLastRspTime; } + srt::sync::steady_clock::time_point ActivatedSince() const { return m_tsActivationSince; } int32_t rcvSeqNo() const { return m_iRcvCurrSeqNo; } int flowWindowSize() const { return m_iFlowWindowSize; } @@ -386,6 +389,7 @@ class CUDT int MSS() const { return m_iMSS; } uint32_t latency_us() const {return m_iTsbPdDelay_ms*1000; } + int peer_idle_tout_ms() const { return m_iOPT_PeerIdleTimeout; } size_t maxPayloadSize() const { return m_iMaxSRTPayloadSize; } size_t OPT_PayloadSize() const { return m_zOPT_ExpPayloadSize; } int sndLossLength() { return m_pSndLossList->getLossLength(); } diff --git a/srtcore/group.cpp b/srtcore/group.cpp index c76b8d6c4..88030ba17 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -2969,6 +2969,118 @@ void CUDTGroup::sendBackup_CheckIdleTime(gli_t w_d) } } +#if SRT_DEBUG_BONDING_STATES +class StabilityTracer +{ +public: + StabilityTracer() + { + } + + ~StabilityTracer() + { + srt::sync::ScopedLock lck(m_mtx); + m_fout.close(); + } + + void trace(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, int64_t stability_tmo_us, const std::string& state) + { + srt::sync::ScopedLock lck(m_mtx); + create_file(); + + m_fout << srt::sync::FormatTime(currtime) << ","; + m_fout << u.id() << ","; + m_fout << u.RTT() << ","; + m_fout << u.RTTVar() << ","; + m_fout << stability_tmo_us << ","; + m_fout << count_microseconds(currtime - u.LastRspTime()) << ","; + m_fout << state << ","; + m_fout << (srt::sync::is_zero(u.ActivatedSince()) ? -1 : (count_microseconds(currtime - u.ActivatedSince()))) << "\n"; + m_fout.flush(); + } + +private: + void print_header() + { + //srt::sync::ScopedLock lck(m_mtx); + m_fout << "Timepoint,SocketID,usRTT,usRTTVar,usStabilityTimeout,usSinceLastResp,State,usSinceActivation\n"; + } + + void create_file() + { + if (m_fout) + return; + + std::string str_tnow = srt::sync::FormatTimeSys(srt::sync::steady_clock::now()); + str_tnow.resize(str_tnow.size() - 6); // remove trailing ' [SYS]' part + while (str_tnow.find(':') != std::string::npos) { + str_tnow.replace(str_tnow.find(':'), 1, 1, '_'); + } + const std::string fname = "stability_trace_" + str_tnow + ".csv"; + m_fout.open(fname, std::ofstream::out); + if (!m_fout) + std::cerr << "IPE: Failed to open " << fname << "!!!\n"; + + print_header(); + } + +private: + srt::sync::Mutex m_mtx; + std::ofstream m_fout; +}; + +StabilityTracer s_stab_trace; +#endif + +/// @retval 1 - link is identified as stable +/// @retval 0 - link state remains unchanged (too early to identify, still in activation phase) +/// @retval -1 - link is identified as unstable +static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::steady_clock::time_point& currtime) +{ + // There was already a response from peer while we are here. + // m_tsUnstableSince = 0; + // Do we need to keep the activation phase? + if (currtime <= u.LastRspTime()) { +#if SRT_DEBUG_BONDING_STATES + s_stab_trace.trace(u, currtime, -1, "STABLE"); +#endif + return 1; + } + + // RTT and RTTVar values during activation period are still being refined, + // therefore it is incorrect to use the dymanic timeout. + const uint32_t latency_us = u.latency_us(); + const uint32_t activation_period_us = latency_us + 50000; + const bool is_activation_phase = !is_zero(u.ActivatedSince()) + && (count_microseconds(currtime - u.ActivatedSince()) < activation_period_us); + + const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. + const int peer_idle_tout_us = u.peer_idle_tout_ms() * 1000; + + SRT_ASSERT(latency_us < peer_idle_tout_us); + const int64_t stability_tout_us = is_activation_phase + ? min(max(min_stability_us, latency_us), peer_idle_tout_us) // activation phase + : min(max(min_stability_us, 2 * u.RTT() + 4 * u.RTTVar()), latency_us); + + // TODO: td_response = currtime - max(u.LastRspTime(), u.ActivatedSince()); + // TODO: remove m_iOptGroupStabTimeout + const steady_clock::duration td_response = currtime - u.LastRspTime(); + if (count_microseconds(td_response) > stability_tout_us) + { +#if SRT_DEBUG_BONDING_STATES + s_stab_trace.trace(u, currtime, stability_tout_us, "UNSTABLE"); +#endif + return -1; + } + + // u.LastRspTime() > currtime is alwats true due to the very first check above in this function +#if SRT_DEBUG_BONDING_STATES + s_stab_trace.trace(u, currtime, stability_tout_us, "STABLE"); +#endif + return is_activation_phase ? 0 : 1; +} + + // [[using locked(this->m_GroupLock)]] bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point currtime) { @@ -2985,8 +3097,6 @@ bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point // negative value is relatively easy, while introducing a mutex would only add a // deadlock risk and performance degradation. - bool is_stable = true; - HLOGC(gslog.Debug, log << "grp/sendBackup: CHECK STABLE: @" << d->id << ": TIMEDIFF {response= " << FormatDuration(currtime - u.m_tsLastRspTime) @@ -2996,89 +3106,34 @@ bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point << (!is_zero(u.m_tsUnstableSince) ? FormatDuration(currtime - u.m_tsUnstableSince) : "NEVER") << "}"); - if (currtime > u.m_tsLastRspTime) - { - // The last response predates the start of this function, look at the difference - const steady_clock::duration td_responsive = currtime - u.m_tsLastRspTime; - bool check_stability = true; - if (!is_zero(u.m_tsFreshActivation) && u.m_tsFreshActivation < currtime) - { - // The link is temporary-activated. Calculate then since the activation time. + const int is_stable = sendBackup_CheckRunningLinkStable(u, currtime); - // Check the last received ACK time first. This time is initialized with 'now' - // at the CUDT::open call, so you can't count on the trap zero time here, but - // it's still possible to check if activation time predates the ACK time. Things - // are here in the following possible order: - // - // - ACK time (old because defined at open) - // - Response time (old because the time of received handshake or keepalive counts) - // ... long time nothing ... - // - Activation time. - // - // If we have this situation, we have to wait for at least one ACK that is - // newer than activation time. However, if in this situation we have a fresh - // response, that is: - // - // - ACK time - // ... - // - Activation time - // - Response time (because a Keepalive had a caprice to come accidentally after sending) - // - // We still wait for a situation that there's at least one ACK that is newer than activation. - - // As we DO have activation time, we need to check if there's at least - // one ACK newer than activation, that is, td_acked < td_active - if (u.m_tsLastRspAckTime < u.m_tsFreshActivation) - { - check_stability = false; - HLOGC(gslog.Debug, - log << "grp/sendBackup: link @" << d->id - << " activated after ACK, " - "not checking for stability"); - } - else - { - u.m_tsFreshActivation = steady_clock::time_point(); - } - } - - if (check_stability && count_microseconds(td_responsive) > m_uOPT_StabilityTimeout) - { - if (is_zero(u.m_tsUnstableSince)) - { - HLOGC(gslog.Debug, - log << "grp/sendBackup: socket NEW UNSTABLE: @" << d->id << " last heard " - << FormatDuration(td_responsive) << " > " << m_uOPT_StabilityTimeout - << " (stability timeout)"); - // The link seems to have missed two ACKs already. - // Qualify this link as unstable - // Notify that it has been seen so since now - u.m_tsUnstableSince = currtime; - } - - is_stable = false; - } - } - - if (is_stable) + if (is_stable >= 0) { - // If stability is ok, but unstable-since was set before, reset it. HLOGC(gslog.Debug, log << "grp/sendBackup: link STABLE: @" << d->id << (!is_zero(u.m_tsUnstableSince) ? " - RESTORED" : " - CONTINUED") << ", state RUNNING - will send a payload"); u.m_tsUnstableSince = steady_clock::time_point(); + + // For some cases + if (is_stable > 0) + u.m_tsActivationSince = steady_clock::time_point(); } else { HLOGC(gslog.Debug, log << "grp/sendBackup: link UNSTABLE for " << FormatDuration(currtime - u.m_tsUnstableSince) << " : @" << d->id << " - will send a payload"); + if (is_zero(u.m_tsUnstableSince)) + { + u.m_tsUnstableSince = currtime; + } } - return is_stable; + return is_stable >= 0; } // [[using locked(this->m_GroupLock)]] @@ -3801,12 +3856,10 @@ void CUDTGroup::sendBackup_SilenceRedundantLinks(vector& w_parallel) } CUDT& ce = d->ps->core(); steady_clock::duration td(0); - if (!is_zero(ce.m_tsFreshActivation) && - count_microseconds(td = currtime - ce.m_tsFreshActivation) < ce.m_uOPT_StabilityTimeout) + if (!is_zero(ce.m_tsActivationSince) && sendBackup_CheckRunningLinkStable(ce, currtime) != 1) { HLOGC(gslog.Debug, - log << "... not silencing @" << d->id << ": too early: " << FormatDuration(td) << " < " - << ce.m_uOPT_StabilityTimeout << "(stability timeout)"); + log << "... not silencing @" << d->id << ": too early: " << FormatDuration(td)); continue; } From f6cd021567776b458d856d762a4d2118809167fb Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Fri, 29 Jan 2021 13:32:57 +0100 Subject: [PATCH 2/7] Renamed to m_tsFreshActivation --- srtcore/core.h | 2 +- srtcore/group.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/srtcore/core.h b/srtcore/core.h index 90bd637ee..18d60c38d 100644 --- a/srtcore/core.h +++ b/srtcore/core.h @@ -379,7 +379,7 @@ class CUDT int32_t schedSeqNo() const { return m_iSndNextSeqNo; } bool overrideSndSeqNo(int32_t seq); srt::sync::steady_clock::time_point LastRspTime() const { return m_tsLastRspTime; } - srt::sync::steady_clock::time_point ActivatedSince() const { return m_tsActivationSince; } + srt::sync::steady_clock::time_point FreshActivationStart() const { return m_tsFreshActivation; } int32_t rcvSeqNo() const { return m_iRcvCurrSeqNo; } int flowWindowSize() const { return m_iFlowWindowSize; } diff --git a/srtcore/group.cpp b/srtcore/group.cpp index 88030ba17..2405c6863 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -2995,7 +2995,7 @@ class StabilityTracer m_fout << stability_tmo_us << ","; m_fout << count_microseconds(currtime - u.LastRspTime()) << ","; m_fout << state << ","; - m_fout << (srt::sync::is_zero(u.ActivatedSince()) ? -1 : (count_microseconds(currtime - u.ActivatedSince()))) << "\n"; + m_fout << (srt::sync::is_zero(u.FreshActivationStart()) ? -1 : (count_microseconds(currtime - u.FreshActivationStart()))) << "\n"; m_fout.flush(); } @@ -3051,8 +3051,8 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste // therefore it is incorrect to use the dymanic timeout. const uint32_t latency_us = u.latency_us(); const uint32_t activation_period_us = latency_us + 50000; - const bool is_activation_phase = !is_zero(u.ActivatedSince()) - && (count_microseconds(currtime - u.ActivatedSince()) < activation_period_us); + const bool is_activation_phase = !is_zero(u.FreshActivationStart()) + && (count_microseconds(currtime - u.FreshActivationStart()) < activation_period_us); const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. const int peer_idle_tout_us = u.peer_idle_tout_ms() * 1000; @@ -3062,7 +3062,7 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste ? min(max(min_stability_us, latency_us), peer_idle_tout_us) // activation phase : min(max(min_stability_us, 2 * u.RTT() + 4 * u.RTTVar()), latency_us); - // TODO: td_response = currtime - max(u.LastRspTime(), u.ActivatedSince()); + // TODO: td_response = currtime - max(u.LastRspTime(), u.FreshActivationStart()); // TODO: remove m_iOptGroupStabTimeout const steady_clock::duration td_response = currtime - u.LastRspTime(); if (count_microseconds(td_response) > stability_tout_us) @@ -3120,7 +3120,7 @@ bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point // For some cases if (is_stable > 0) - u.m_tsActivationSince = steady_clock::time_point(); + u.m_tsFreshActivation = steady_clock::time_point(); } else { @@ -3856,7 +3856,7 @@ void CUDTGroup::sendBackup_SilenceRedundantLinks(vector& w_parallel) } CUDT& ce = d->ps->core(); steady_clock::duration td(0); - if (!is_zero(ce.m_tsActivationSince) && sendBackup_CheckRunningLinkStable(ce, currtime) != 1) + if (!is_zero(ce.m_tsFreshActivation) && sendBackup_CheckRunningLinkStable(ce, currtime) != 1) { HLOGC(gslog.Debug, log << "... not silencing @" << d->id << ": too early: " << FormatDuration(td)); From 938d17bd905dfed230ea591d83c9647be5b8c3c3 Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Fri, 29 Jan 2021 17:29:08 +0100 Subject: [PATCH 3/7] Added link weight to trace log. Removed early exit from activation phase. --- srtcore/core.h | 2 +- srtcore/group.cpp | 25 ++++++++++++++----------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/srtcore/core.h b/srtcore/core.h index 18d60c38d..3dc097c2a 100644 --- a/srtcore/core.h +++ b/srtcore/core.h @@ -388,7 +388,7 @@ class CUDT int64_t maxBandwidth() const { return m_llMaxBW; } int MSS() const { return m_iMSS; } - uint32_t latency_us() const {return m_iTsbPdDelay_ms*1000; } + uint32_t peer_latency_us() const {return m_iPeerTsbPdDelay_ms * 1000; } int peer_idle_tout_ms() const { return m_iOPT_PeerIdleTimeout; } size_t maxPayloadSize() const { return m_iMaxSRTPayloadSize; } size_t OPT_PayloadSize() const { return m_zOPT_ExpPayloadSize; } diff --git a/srtcore/group.cpp b/srtcore/group.cpp index 2405c6863..2dc3115e8 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -2983,13 +2983,15 @@ class StabilityTracer m_fout.close(); } - void trace(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, int64_t stability_tmo_us, const std::string& state) + void trace(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, int64_t stability_tmo_us, const std::string& state, uint16_t weight) { srt::sync::ScopedLock lck(m_mtx); create_file(); m_fout << srt::sync::FormatTime(currtime) << ","; m_fout << u.id() << ","; + m_fout << weight << ","; + m_fout << u.peer_latency_us() << ","; m_fout << u.RTT() << ","; m_fout << u.RTTVar() << ","; m_fout << stability_tmo_us << ","; @@ -3003,7 +3005,7 @@ class StabilityTracer void print_header() { //srt::sync::ScopedLock lck(m_mtx); - m_fout << "Timepoint,SocketID,usRTT,usRTTVar,usStabilityTimeout,usSinceLastResp,State,usSinceActivation\n"; + m_fout << "Timepoint,SocketID,weight,usLatency,usRTT,usRTTVar,usStabilityTimeout,usSinceLastResp,State,usSinceActivation\n"; } void create_file() @@ -3032,24 +3034,25 @@ class StabilityTracer StabilityTracer s_stab_trace; #endif +/// TODO: Remove 'weight' parameter. Only needed for logging. /// @retval 1 - link is identified as stable /// @retval 0 - link state remains unchanged (too early to identify, still in activation phase) /// @retval -1 - link is identified as unstable -static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::steady_clock::time_point& currtime) +static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, uint16_t weight) { // There was already a response from peer while we are here. // m_tsUnstableSince = 0; // Do we need to keep the activation phase? - if (currtime <= u.LastRspTime()) { + /*if (currtime <= u.LastRspTime()) { #if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, -1, "STABLE"); + s_stab_trace.trace(u, currtime, -1, "STABLE", weight); #endif return 1; - } + }*/ // RTT and RTTVar values during activation period are still being refined, // therefore it is incorrect to use the dymanic timeout. - const uint32_t latency_us = u.latency_us(); + const uint32_t latency_us = u.peer_latency_us(); const uint32_t activation_period_us = latency_us + 50000; const bool is_activation_phase = !is_zero(u.FreshActivationStart()) && (count_microseconds(currtime - u.FreshActivationStart()) < activation_period_us); @@ -3068,14 +3071,14 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste if (count_microseconds(td_response) > stability_tout_us) { #if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, stability_tout_us, "UNSTABLE"); + s_stab_trace.trace(u, currtime, stability_tout_us, "UNSTABLE", weight); #endif return -1; } // u.LastRspTime() > currtime is alwats true due to the very first check above in this function #if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, stability_tout_us, "STABLE"); + s_stab_trace.trace(u, currtime, stability_tout_us, "STABLE", weight); #endif return is_activation_phase ? 0 : 1; } @@ -3107,7 +3110,7 @@ bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point << "}"); - const int is_stable = sendBackup_CheckRunningLinkStable(u, currtime); + const int is_stable = sendBackup_CheckRunningLinkStable(u, currtime, d->weight); if (is_stable >= 0) { @@ -3856,7 +3859,7 @@ void CUDTGroup::sendBackup_SilenceRedundantLinks(vector& w_parallel) } CUDT& ce = d->ps->core(); steady_clock::duration td(0); - if (!is_zero(ce.m_tsFreshActivation) && sendBackup_CheckRunningLinkStable(ce, currtime) != 1) + if (!is_zero(ce.m_tsFreshActivation) && sendBackup_CheckRunningLinkStable(ce, currtime, d->weight) != 1) { HLOGC(gslog.Debug, log << "... not silencing @" << d->id << ": too early: " << FormatDuration(td)); From 8c925fa6805efe149568a5d3a90b0fd7fa8f6927 Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Fri, 29 Jan 2021 17:47:59 +0100 Subject: [PATCH 4/7] Added activation period to stab trace --- srtcore/group.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/srtcore/group.cpp b/srtcore/group.cpp index 2dc3115e8..694cc6249 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -2983,7 +2983,8 @@ class StabilityTracer m_fout.close(); } - void trace(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, int64_t stability_tmo_us, const std::string& state, uint16_t weight) + void trace(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, uint32_t activation_period_us, + int64_t stability_tmo_us, const std::string& state, uint16_t weight) { srt::sync::ScopedLock lck(m_mtx); create_file(); @@ -2997,7 +2998,8 @@ class StabilityTracer m_fout << stability_tmo_us << ","; m_fout << count_microseconds(currtime - u.LastRspTime()) << ","; m_fout << state << ","; - m_fout << (srt::sync::is_zero(u.FreshActivationStart()) ? -1 : (count_microseconds(currtime - u.FreshActivationStart()))) << "\n"; + m_fout << (srt::sync::is_zero(u.FreshActivationStart()) ? -1 : (count_microseconds(currtime - u.FreshActivationStart()))) << ","; + m_fout << activation_period_us << "\n"; m_fout.flush(); } @@ -3005,7 +3007,7 @@ class StabilityTracer void print_header() { //srt::sync::ScopedLock lck(m_mtx); - m_fout << "Timepoint,SocketID,weight,usLatency,usRTT,usRTTVar,usStabilityTimeout,usSinceLastResp,State,usSinceActivation\n"; + m_fout << "Timepoint,SocketID,weight,usLatency,usRTT,usRTTVar,usStabilityTimeout,usSinceLastResp,State,usSinceActivation,usActivationPeriod\n"; } void create_file() @@ -3054,8 +3056,9 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste // therefore it is incorrect to use the dymanic timeout. const uint32_t latency_us = u.peer_latency_us(); const uint32_t activation_period_us = latency_us + 50000; + //const int64_t since_activation_us = count_microseconds(currtime - u.FreshActivationStart()); const bool is_activation_phase = !is_zero(u.FreshActivationStart()) - && (count_microseconds(currtime - u.FreshActivationStart()) < activation_period_us); + && (count_microseconds(currtime - u.FreshActivationStart()) <= activation_period_us); const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. const int peer_idle_tout_us = u.peer_idle_tout_ms() * 1000; @@ -3071,14 +3074,14 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste if (count_microseconds(td_response) > stability_tout_us) { #if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, stability_tout_us, "UNSTABLE", weight); + s_stab_trace.trace(u, currtime, activation_period_us, stability_tout_us, is_activation_phase ? "ACTIVATION-UNSTABLE" : "UNSTABLE", weight); #endif return -1; } // u.LastRspTime() > currtime is alwats true due to the very first check above in this function #if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, stability_tout_us, "STABLE", weight); + s_stab_trace.trace(u, currtime, activation_period_us, stability_tout_us, is_activation_phase ? "ACTIVATION" : "STABLE", weight); #endif return is_activation_phase ? 0 : 1; } From 07a89817bdb373524b2fd9ef7c5050dadf5d990e Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Thu, 4 Feb 2021 12:42:53 +0100 Subject: [PATCH 5/7] Updated stability timout calculations --- srtcore/group.cpp | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/srtcore/group.cpp b/srtcore/group.cpp index 694cc6249..746280788 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -3036,40 +3036,28 @@ class StabilityTracer StabilityTracer s_stab_trace; #endif -/// TODO: Remove 'weight' parameter. Only needed for logging. +/// TODO: Remove 'weight' parameter? Only needed for logging. /// @retval 1 - link is identified as stable /// @retval 0 - link state remains unchanged (too early to identify, still in activation phase) /// @retval -1 - link is identified as unstable static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, uint16_t weight) { - // There was already a response from peer while we are here. - // m_tsUnstableSince = 0; - // Do we need to keep the activation phase? - /*if (currtime <= u.LastRspTime()) { -#if SRT_DEBUG_BONDING_STATES - s_stab_trace.trace(u, currtime, -1, "STABLE", weight); -#endif - return 1; - }*/ - - // RTT and RTTVar values during activation period are still being refined, - // therefore it is incorrect to use the dymanic timeout. const uint32_t latency_us = u.peer_latency_us(); - const uint32_t activation_period_us = latency_us + 50000; - //const int64_t since_activation_us = count_microseconds(currtime - u.FreshActivationStart()); + const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. + const int64_t initial_stabtout_us = max(min_stability_us, latency_us); + const int64_t activation_period_us = initial_stabtout_us + 5 * CUDT::COMM_SYN_INTERVAL_US; + + // RTT and RTTVar values are still being refined during activation period, + // therefore the dymanic timeout should not be used in activation phase. const bool is_activation_phase = !is_zero(u.FreshActivationStart()) && (count_microseconds(currtime - u.FreshActivationStart()) <= activation_period_us); - const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. const int peer_idle_tout_us = u.peer_idle_tout_ms() * 1000; - SRT_ASSERT(latency_us < peer_idle_tout_us); const int64_t stability_tout_us = is_activation_phase - ? min(max(min_stability_us, latency_us), peer_idle_tout_us) // activation phase + ? initial_stabtout_us // activation phase : min(max(min_stability_us, 2 * u.RTT() + 4 * u.RTTVar()), latency_us); - // TODO: td_response = currtime - max(u.LastRspTime(), u.FreshActivationStart()); - // TODO: remove m_iOptGroupStabTimeout const steady_clock::duration td_response = currtime - u.LastRspTime(); if (count_microseconds(td_response) > stability_tout_us) { @@ -3112,7 +3100,6 @@ bool CUDTGroup::sendBackup_CheckRunningStability(const gli_t d, const time_point << (!is_zero(u.m_tsUnstableSince) ? FormatDuration(currtime - u.m_tsUnstableSince) : "NEVER") << "}"); - const int is_stable = sendBackup_CheckRunningLinkStable(u, currtime, d->weight); if (is_stable >= 0) From 1ad8cfa0cefac5aaefcf1473b6d96a788a3c470b Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Fri, 5 Feb 2021 12:40:17 +0100 Subject: [PATCH 6/7] Fixed stability check. Fixed naming. --- srtcore/core.cpp | 2 +- srtcore/core.h | 8 ++++---- srtcore/group.cpp | 19 +++++++++---------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/srtcore/core.cpp b/srtcore/core.cpp index 27dfc6662..b478e1dba 100644 --- a/srtcore/core.cpp +++ b/srtcore/core.cpp @@ -11172,7 +11172,7 @@ bool CUDT::checkExpTimer(const steady_clock::time_point& currtime, int check_rea * (keepalive fix) * duB: * It seems there is confusion of the direction of the Response here. - * LastRspTime is supposed to be when receiving (data/ctrl) from peer + * lastRspTime is supposed to be when receiving (data/ctrl) from peer * as shown in processCtrl and processData, * Here we set because we sent something? * diff --git a/srtcore/core.h b/srtcore/core.h index 3dc097c2a..8c0a1a85e 100644 --- a/srtcore/core.h +++ b/srtcore/core.h @@ -378,8 +378,8 @@ class CUDT int32_t sndSeqNo() const { return m_iSndCurrSeqNo; } int32_t schedSeqNo() const { return m_iSndNextSeqNo; } bool overrideSndSeqNo(int32_t seq); - srt::sync::steady_clock::time_point LastRspTime() const { return m_tsLastRspTime; } - srt::sync::steady_clock::time_point FreshActivationStart() const { return m_tsFreshActivation; } + srt::sync::steady_clock::time_point lastRspTime() const { return m_tsLastRspTime; } + srt::sync::steady_clock::time_point freshActivationStart() const { return m_tsFreshActivation; } int32_t rcvSeqNo() const { return m_iRcvCurrSeqNo; } int flowWindowSize() const { return m_iFlowWindowSize; } @@ -388,8 +388,8 @@ class CUDT int64_t maxBandwidth() const { return m_llMaxBW; } int MSS() const { return m_iMSS; } - uint32_t peer_latency_us() const {return m_iPeerTsbPdDelay_ms * 1000; } - int peer_idle_tout_ms() const { return m_iOPT_PeerIdleTimeout; } + uint32_t peerLatency_us() const {return m_iPeerTsbPdDelay_ms * 1000; } + int peerIdleTimeout_ms() const { return m_iOPT_PeerIdleTimeout; } size_t maxPayloadSize() const { return m_iMaxSRTPayloadSize; } size_t OPT_PayloadSize() const { return m_zOPT_ExpPayloadSize; } int sndLossLength() { return m_pSndLossList->getLossLength(); } diff --git a/srtcore/group.cpp b/srtcore/group.cpp index 746280788..77529618e 100644 --- a/srtcore/group.cpp +++ b/srtcore/group.cpp @@ -2992,13 +2992,13 @@ class StabilityTracer m_fout << srt::sync::FormatTime(currtime) << ","; m_fout << u.id() << ","; m_fout << weight << ","; - m_fout << u.peer_latency_us() << ","; + m_fout << u.peerLatency_us() << ","; m_fout << u.RTT() << ","; m_fout << u.RTTVar() << ","; m_fout << stability_tmo_us << ","; - m_fout << count_microseconds(currtime - u.LastRspTime()) << ","; + m_fout << count_microseconds(currtime - u.lastRspTime()) << ","; m_fout << state << ","; - m_fout << (srt::sync::is_zero(u.FreshActivationStart()) ? -1 : (count_microseconds(currtime - u.FreshActivationStart()))) << ","; + m_fout << (srt::sync::is_zero(u.freshActivationStart()) ? -1 : (count_microseconds(currtime - u.freshActivationStart()))) << ","; m_fout << activation_period_us << "\n"; m_fout.flush(); } @@ -3042,23 +3042,22 @@ StabilityTracer s_stab_trace; /// @retval -1 - link is identified as unstable static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::steady_clock::time_point& currtime, uint16_t weight) { - const uint32_t latency_us = u.peer_latency_us(); + const uint32_t latency_us = u.peerLatency_us(); const int32_t min_stability_us = 60000; // Minimum Link Stability Timeout: 60ms. const int64_t initial_stabtout_us = max(min_stability_us, latency_us); const int64_t activation_period_us = initial_stabtout_us + 5 * CUDT::COMM_SYN_INTERVAL_US; // RTT and RTTVar values are still being refined during activation period, // therefore the dymanic timeout should not be used in activation phase. - const bool is_activation_phase = !is_zero(u.FreshActivationStart()) - && (count_microseconds(currtime - u.FreshActivationStart()) <= activation_period_us); - - const int peer_idle_tout_us = u.peer_idle_tout_ms() * 1000; + const bool is_activation_phase = !is_zero(u.freshActivationStart()) + && (count_microseconds(currtime - u.freshActivationStart()) <= activation_period_us); const int64_t stability_tout_us = is_activation_phase ? initial_stabtout_us // activation phase : min(max(min_stability_us, 2 * u.RTT() + 4 * u.RTTVar()), latency_us); - const steady_clock::duration td_response = currtime - u.LastRspTime(); + const steady_clock::time_point last_rsp = max(u.freshActivationStart(), u.lastRspTime()); + const steady_clock::duration td_response = currtime - last_rsp; if (count_microseconds(td_response) > stability_tout_us) { #if SRT_DEBUG_BONDING_STATES @@ -3067,7 +3066,7 @@ static int sendBackup_CheckRunningLinkStable(const CUDT& u, const srt::sync::ste return -1; } - // u.LastRspTime() > currtime is alwats true due to the very first check above in this function + // u.lastRspTime() > currtime is alwats true due to the very first check above in this function #if SRT_DEBUG_BONDING_STATES s_stab_trace.trace(u, currtime, activation_period_us, stability_tout_us, is_activation_phase ? "ACTIVATION" : "STABLE", weight); #endif From 6033e9072594e67cab0a69c4a5f980123db7f448 Mon Sep 17 00:00:00 2001 From: Maxim Sharabayko Date: Fri, 5 Feb 2021 17:34:54 +0100 Subject: [PATCH 7/7] Added a note to docs --- docs/APISocketOptions.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/APISocketOptions.md b/docs/APISocketOptions.md index 5e7313526..d78cec771 100644 --- a/docs/APISocketOptions.md +++ b/docs/APISocketOptions.md @@ -427,6 +427,8 @@ function will return the group, not this socket ID. | --------------------- | ----- | ------- | ---------- | ------ | -------- | ------ | --- | ------ | | `SRTO_GROUPSTABTIMEO` | 1.5.0 | pre | `int32_t` | ms | 80 | 10-... | W | GSD+ | +**Not in use at the moment. Is to be repurposed in SRT v1.4.3!** + This setting is used for groups of type `SRT_GTYPE_BACKUP`. It defines the stability timeout, which is the maximum interval between two consecutive packets retrieved from the peer on the currently active link. These two packets can be of any type,