Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

shell: add ddd_diagnose command #175

Merged
merged 4 commits into from
Sep 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions src/shell/commands.h
Original file line number Diff line number Diff line change
Expand Up @@ -4310,3 +4310,232 @@ inline bool clear_app_envs(command_executor *e, shell_context *sc, arguments arg
}
return true;
}

inline dsn::rpc_address diagnose_recommend(const ddd_partition_info &pinfo)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

加个注释吧

// Select a best node to be the new primary after a DDD situation, following the rule:
//  - choose the node with largest committed decree
//  - if committed decree is the same, choose the node with largest ballot

不知道规则有没有理解对,另外在 pr description 里面描述一下 DDD 推荐的规则吧,以后也可以写到文档里

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

{
if (pinfo.config.last_drops.size() < 2)
return dsn::rpc_address();

std::vector<dsn::rpc_address> last_two_nodes(pinfo.config.last_drops.end() - 2,
pinfo.config.last_drops.end());
std::vector<ddd_node_info> last_dropped;
for (auto &node : last_two_nodes) {
auto it = std::find_if(pinfo.dropped.begin(),
pinfo.dropped.end(),
[&node](const ddd_node_info &r) { return r.node == node; });
if (it->is_alive && it->is_collected)
last_dropped.push_back(*it);
}

if (last_dropped.size() == 1) {
const ddd_node_info &ninfo = last_dropped.back();
if (ninfo.last_committed_decree >= pinfo.config.last_committed_decree)
return ninfo.node;
} else if (last_dropped.size() == 2) {
const ddd_node_info &secondary = last_dropped.front();
const ddd_node_info &latest = last_dropped.back();

// Select a best node to be the new primary, following the rule:
// - choose the node with the largest last committed decree
// - if last committed decree is the same, choose node with the largest ballot

if (latest.last_committed_decree == secondary.last_committed_decree &&
latest.last_committed_decree >= pinfo.config.last_committed_decree)
return latest.ballot >= secondary.ballot ? latest.node : secondary.node;

if (latest.last_committed_decree > secondary.last_committed_decree &&
latest.last_committed_decree >= pinfo.config.last_committed_decree)
return latest.node;

if (secondary.last_committed_decree > latest.last_committed_decree &&
secondary.last_committed_decree >= pinfo.config.last_committed_decree)
return secondary.node;
}

return dsn::rpc_address();
}

inline bool ddd_diagnose(command_executor *e, shell_context *sc, arguments args)
{
static struct option long_options[] = {{"gpid", required_argument, 0, 'g'},
{"diagnose", no_argument, 0, 'd'},
{"auto_diagnose", no_argument, 0, 'a'},
{"skip_prompt", no_argument, 0, 's'},
{"output", required_argument, 0, 'o'},
{0, 0, 0, 0}};

std::string out_file;
dsn::gpid id(-1, -1);
bool diagnose = false;
bool auto_diagnose = false;
bool skip_prompt = false;
optind = 0;
while (true) {
int option_index = 0;
int c;
c = getopt_long(args.argc, args.argv, "g:daso:", long_options, &option_index);
if (c == -1)
break;
switch (c) {
case 'g':
int pid;
if (id.parse_from(optarg)) {
// app_id.partition_index
} else if (sscanf(optarg, "%d", &pid) == 1) {
// app_id
id.set_app_id(pid);
} else {
fprintf(stderr, "ERROR: invalid gpid %s\n", optarg);
return false;
}
break;
case 'd':
diagnose = true;
break;
case 'a':
auto_diagnose = true;
break;
case 's':
skip_prompt = true;
break;
case 'o':
out_file = optarg;
break;
default:
return false;
}
}

std::vector<ddd_partition_info> ddd_partitions;
::dsn::error_code ret = sc->ddl_client->ddd_diagnose(id, ddd_partitions);
if (ret != dsn::ERR_OK) {
fprintf(stderr, "ERROR: DDD diagnose failed with err = %s\n", ret.to_string());
return true;
}

std::streambuf *buf;
std::ofstream of;

if (!out_file.empty()) {
of.open(out_file);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of使用完毕需要close掉

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不用自己close,对象析构时会自动close

buf = of.rdbuf();
} else {
buf = std::cout.rdbuf();
}
std::ostream out(buf);

out << "Total " << ddd_partitions.size() << " ddd partitions:" << std::endl;
out << std::endl;
int proposed_count = 0;
int i = 0;
for (const ddd_partition_info &pinfo : ddd_partitions) {
out << "(" << ++i << ") " << pinfo.config.pid.to_string() << std::endl;
out << " config: ballot(" << pinfo.config.ballot << "), "
<< "last_committed(" << pinfo.config.last_committed_decree << ")" << std::endl;
out << " ----" << std::endl;
dsn::rpc_address latest_dropped, secondary_latest_dropped;
if (pinfo.config.last_drops.size() > 0)
latest_dropped = pinfo.config.last_drops[pinfo.config.last_drops.size() - 1];
if (pinfo.config.last_drops.size() > 1)
secondary_latest_dropped = pinfo.config.last_drops[pinfo.config.last_drops.size() - 2];
int j = 0;
for (const ddd_node_info &n : pinfo.dropped) {
char time_buf[30];
::dsn::utils::time_ms_to_string(n.drop_time_ms, time_buf);
out << " dropped[" << j++ << "]: "
<< "node(" << n.node.to_string() << "), "
<< "drop_time(" << time_buf << "), "
<< "alive(" << (n.is_alive ? "true" : "false") << "), "
<< "collected(" << (n.is_collected ? "true" : "false") << "), "
<< "ballot(" << n.ballot << "), "
<< "last_committed(" << n.last_committed_decree << "), "
<< "last_prepared(" << n.last_prepared_decree << ")";
if (n.node == latest_dropped)
out << " <== the latest";
else if (n.node == secondary_latest_dropped)
out << " <== the secondary latest";
out << std::endl;
}
out << " ----" << std::endl;
j = 0;
for (const ::dsn::rpc_address &r : pinfo.config.last_drops) {
out << " last_drops[" << j++ << "]: "
<< "node(" << r.to_string() << ")";
if (j == (int)pinfo.config.last_drops.size() - 1)
out << " <== the secondary latest";
else if (j == (int)pinfo.config.last_drops.size())
out << " <== the latest";
out << std::endl;
}
out << " ----" << std::endl;
out << " ddd_reason: " << pinfo.reason << std::endl;
if (diagnose) {
out << " ----" << std::endl;

dsn::rpc_address primary = diagnose_recommend(pinfo);
out << " recommend_primary: "
<< (primary.is_invalid() ? "none" : primary.to_string());
if (primary == latest_dropped)
out << " <== the latest";
else if (primary == secondary_latest_dropped)
out << " <== the secondary latest";
out << std::endl;

bool skip_this = false;
if (!primary.is_invalid() && !auto_diagnose && !skip_prompt) {
do {
std::cout << " > Are you sure to use the recommend primary? [y/n/s(skip)]: ";
char c;
std::cin >> c;
if (c == 'y') {
break;
} else if (c == 'n') {
primary.set_invalid();
break;
} else if (c == 's') {
skip_this = true;
std::cout << " > You have choosed to skip diagnosing this partition."
<< std::endl;
break;
}
} while (true);
}

if (primary.is_invalid() && !skip_prompt && !skip_this) {
do {
std::cout << " > Please input the primary node: ";
std::string addr;
std::cin >> addr;
if (primary.from_string_ipv4(addr.c_str())) {
break;
} else {
std::cout << " > Sorry, you have input an invalid node address."
<< std::endl;
}
} while (true);
}

if (!primary.is_invalid() && !skip_this) {
dsn::replication::configuration_balancer_request request;
request.gpid = pinfo.config.pid;
request.action_list = {configuration_proposal_action{
primary, primary, config_type::CT_ASSIGN_PRIMARY}};
request.force = false;
dsn::error_code err = sc->ddl_client->send_balancer_proposal(request);
out << " propose_request: propose -g " << request.gpid.to_string()
<< " -p ASSIGN_PRIMARY -t " << primary.to_string() << " -n "
<< primary.to_string() << std::endl;
out << " propose_response: " << err.to_string() << std::endl;
proposed_count++;
} else {
out << " propose_request: none" << std::endl;
}
}
out << std::endl;
out << "Proposed count: " << proposed_count << "/" << ddd_partitions.size() << std::endl;
out << std::endl;
}

std::cout << "Diagnose ddd done." << std::endl;
return true;
}
9 changes: 8 additions & 1 deletion src/shell/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,14 @@ static command_executor commands[] = {
"del_app_envs", "delete current app envs", "<key> [key...]", del_app_envs,
},
{
"clear_app_envs", "clear current app envs", "<-a|--all> <-p|--prefix str>", clear_app_envs,
"clear_app_envs", "clear current app envs", "[-a|--all] [-p|--prefix str]", clear_app_envs,
},
{
"ddd_diagnose",
"diagnose three-dead partitions",
"[-g|--gpid appid|appid.pidx] [-d|--diagnose] [-a|--auto_diagnose] "
"[-s|--skip_prompt] [-o|--output file_name]",
ddd_diagnose,
},
{
"exit", "exit shell", "", exit_shell,
Expand Down