diff --git a/rdsn b/rdsn index 0ccfd9736d..16addfc0ad 160000 --- a/rdsn +++ b/rdsn @@ -1 +1 @@ -Subproject commit 0ccfd9736d2fc950df99b860a0e55f61a9d33de6 +Subproject commit 16addfc0ada004cab3b7366fc46f9b93319d989d diff --git a/src/shell/commands.h b/src/shell/commands.h index 7ce27982e0..19be0d00bd 100644 --- a/src/shell/commands.h +++ b/src/shell/commands.h @@ -4310,3 +4310,232 @@ inline bool clear_app_envs(command_executor *e, shell_context *sc, arguments arg } return true; } + +inline dsn::rpc_address diagnose_recommend(const ddd_partition_info &pinfo) +{ + if (pinfo.config.last_drops.size() < 2) + return dsn::rpc_address(); + + std::vector last_two_nodes(pinfo.config.last_drops.end() - 2, + pinfo.config.last_drops.end()); + std::vector last_dropped; + for (auto &node : last_two_nodes) { + auto it = std::find_if(pinfo.dropped.begin(), + pinfo.dropped.end(), + [&node](const ddd_node_info &r) { return r.node == node; }); + if (it->is_alive && it->is_collected) + last_dropped.push_back(*it); + } + + if (last_dropped.size() == 1) { + const ddd_node_info &ninfo = last_dropped.back(); + if (ninfo.last_committed_decree >= pinfo.config.last_committed_decree) + return ninfo.node; + } else if (last_dropped.size() == 2) { + const ddd_node_info &secondary = last_dropped.front(); + const ddd_node_info &latest = last_dropped.back(); + + // Select a best node to be the new primary, following the rule: + // - choose the node with the largest last committed decree + // - if last committed decree is the same, choose node with the largest ballot + + if (latest.last_committed_decree == secondary.last_committed_decree && + latest.last_committed_decree >= pinfo.config.last_committed_decree) + return latest.ballot >= secondary.ballot ? latest.node : secondary.node; + + if (latest.last_committed_decree > secondary.last_committed_decree && + latest.last_committed_decree >= pinfo.config.last_committed_decree) + return latest.node; + + if (secondary.last_committed_decree > latest.last_committed_decree && + secondary.last_committed_decree >= pinfo.config.last_committed_decree) + return secondary.node; + } + + return dsn::rpc_address(); +} + +inline bool ddd_diagnose(command_executor *e, shell_context *sc, arguments args) +{ + static struct option long_options[] = {{"gpid", required_argument, 0, 'g'}, + {"diagnose", no_argument, 0, 'd'}, + {"auto_diagnose", no_argument, 0, 'a'}, + {"skip_prompt", no_argument, 0, 's'}, + {"output", required_argument, 0, 'o'}, + {0, 0, 0, 0}}; + + std::string out_file; + dsn::gpid id(-1, -1); + bool diagnose = false; + bool auto_diagnose = false; + bool skip_prompt = false; + optind = 0; + while (true) { + int option_index = 0; + int c; + c = getopt_long(args.argc, args.argv, "g:daso:", long_options, &option_index); + if (c == -1) + break; + switch (c) { + case 'g': + int pid; + if (id.parse_from(optarg)) { + // app_id.partition_index + } else if (sscanf(optarg, "%d", &pid) == 1) { + // app_id + id.set_app_id(pid); + } else { + fprintf(stderr, "ERROR: invalid gpid %s\n", optarg); + return false; + } + break; + case 'd': + diagnose = true; + break; + case 'a': + auto_diagnose = true; + break; + case 's': + skip_prompt = true; + break; + case 'o': + out_file = optarg; + break; + default: + return false; + } + } + + std::vector ddd_partitions; + ::dsn::error_code ret = sc->ddl_client->ddd_diagnose(id, ddd_partitions); + if (ret != dsn::ERR_OK) { + fprintf(stderr, "ERROR: DDD diagnose failed with err = %s\n", ret.to_string()); + return true; + } + + std::streambuf *buf; + std::ofstream of; + + if (!out_file.empty()) { + of.open(out_file); + buf = of.rdbuf(); + } else { + buf = std::cout.rdbuf(); + } + std::ostream out(buf); + + out << "Total " << ddd_partitions.size() << " ddd partitions:" << std::endl; + out << std::endl; + int proposed_count = 0; + int i = 0; + for (const ddd_partition_info &pinfo : ddd_partitions) { + out << "(" << ++i << ") " << pinfo.config.pid.to_string() << std::endl; + out << " config: ballot(" << pinfo.config.ballot << "), " + << "last_committed(" << pinfo.config.last_committed_decree << ")" << std::endl; + out << " ----" << std::endl; + dsn::rpc_address latest_dropped, secondary_latest_dropped; + if (pinfo.config.last_drops.size() > 0) + latest_dropped = pinfo.config.last_drops[pinfo.config.last_drops.size() - 1]; + if (pinfo.config.last_drops.size() > 1) + secondary_latest_dropped = pinfo.config.last_drops[pinfo.config.last_drops.size() - 2]; + int j = 0; + for (const ddd_node_info &n : pinfo.dropped) { + char time_buf[30]; + ::dsn::utils::time_ms_to_string(n.drop_time_ms, time_buf); + out << " dropped[" << j++ << "]: " + << "node(" << n.node.to_string() << "), " + << "drop_time(" << time_buf << "), " + << "alive(" << (n.is_alive ? "true" : "false") << "), " + << "collected(" << (n.is_collected ? "true" : "false") << "), " + << "ballot(" << n.ballot << "), " + << "last_committed(" << n.last_committed_decree << "), " + << "last_prepared(" << n.last_prepared_decree << ")"; + if (n.node == latest_dropped) + out << " <== the latest"; + else if (n.node == secondary_latest_dropped) + out << " <== the secondary latest"; + out << std::endl; + } + out << " ----" << std::endl; + j = 0; + for (const ::dsn::rpc_address &r : pinfo.config.last_drops) { + out << " last_drops[" << j++ << "]: " + << "node(" << r.to_string() << ")"; + if (j == (int)pinfo.config.last_drops.size() - 1) + out << " <== the secondary latest"; + else if (j == (int)pinfo.config.last_drops.size()) + out << " <== the latest"; + out << std::endl; + } + out << " ----" << std::endl; + out << " ddd_reason: " << pinfo.reason << std::endl; + if (diagnose) { + out << " ----" << std::endl; + + dsn::rpc_address primary = diagnose_recommend(pinfo); + out << " recommend_primary: " + << (primary.is_invalid() ? "none" : primary.to_string()); + if (primary == latest_dropped) + out << " <== the latest"; + else if (primary == secondary_latest_dropped) + out << " <== the secondary latest"; + out << std::endl; + + bool skip_this = false; + if (!primary.is_invalid() && !auto_diagnose && !skip_prompt) { + do { + std::cout << " > Are you sure to use the recommend primary? [y/n/s(skip)]: "; + char c; + std::cin >> c; + if (c == 'y') { + break; + } else if (c == 'n') { + primary.set_invalid(); + break; + } else if (c == 's') { + skip_this = true; + std::cout << " > You have choosed to skip diagnosing this partition." + << std::endl; + break; + } + } while (true); + } + + if (primary.is_invalid() && !skip_prompt && !skip_this) { + do { + std::cout << " > Please input the primary node: "; + std::string addr; + std::cin >> addr; + if (primary.from_string_ipv4(addr.c_str())) { + break; + } else { + std::cout << " > Sorry, you have input an invalid node address." + << std::endl; + } + } while (true); + } + + if (!primary.is_invalid() && !skip_this) { + dsn::replication::configuration_balancer_request request; + request.gpid = pinfo.config.pid; + request.action_list = {configuration_proposal_action{ + primary, primary, config_type::CT_ASSIGN_PRIMARY}}; + request.force = false; + dsn::error_code err = sc->ddl_client->send_balancer_proposal(request); + out << " propose_request: propose -g " << request.gpid.to_string() + << " -p ASSIGN_PRIMARY -t " << primary.to_string() << " -n " + << primary.to_string() << std::endl; + out << " propose_response: " << err.to_string() << std::endl; + proposed_count++; + } else { + out << " propose_request: none" << std::endl; + } + } + out << std::endl; + out << "Proposed count: " << proposed_count << "/" << ddd_partitions.size() << std::endl; + out << std::endl; + } + + std::cout << "Diagnose ddd done." << std::endl; + return true; +} diff --git a/src/shell/main.cpp b/src/shell/main.cpp index ba84d3850b..47ef35b8a8 100644 --- a/src/shell/main.cpp +++ b/src/shell/main.cpp @@ -375,7 +375,14 @@ static command_executor commands[] = { "del_app_envs", "delete current app envs", " [key...]", del_app_envs, }, { - "clear_app_envs", "clear current app envs", "<-a|--all> <-p|--prefix str>", clear_app_envs, + "clear_app_envs", "clear current app envs", "[-a|--all] [-p|--prefix str]", clear_app_envs, + }, + { + "ddd_diagnose", + "diagnose three-dead partitions", + "[-g|--gpid appid|appid.pidx] [-d|--diagnose] [-a|--auto_diagnose] " + "[-s|--skip_prompt] [-o|--output file_name]", + ddd_diagnose, }, { "exit", "exit shell", "", exit_shell,