Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix rollback by mistake for mutation_log::reset_from #1208

Merged
merged 10 commits into from
Oct 28, 2022
24 changes: 17 additions & 7 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -491,13 +491,6 @@ function run_start_zk()
type nc >/dev/null 2>&1 || { echo >&2 "start zk failed, need install netcat command..."; exit 1;}

INSTALL_DIR=`pwd`/.zk_install
if [ ! -d "${INSTALL_DIR}/zookeeper-bin" ]; then
if [ -d "zookeeper-bin" ]; then
# this zookeeper-bin must have been got from github action workflows, thus just
# move it to ${INSTALL_DIR} to prevent from downloading
mv zookeeper-bin ${INSTALL_DIR}/
fi
fi
PORT=22181
while [[ $# > 0 ]]; do
key="$1"
Expand All @@ -524,6 +517,23 @@ function run_start_zk()
shift
done

if [ ! -d "${INSTALL_DIR}/zookeeper-bin" ]; then
echo "zookeeper-bin cannot be found under ${INSTALL_DIR}, thus try to find an existing one"

if [ -d "zookeeper-bin" ]; then
echo "zookeeper-bin is found under current work dir `pwd`, just use this one"

if ! mkdir -p "${INSTALL_DIR}"; then
echo "ERROR: mkdir ${INSTALL_DIR} failed"
exit 1
fi

# this zookeeper-bin must have been got from github action workflows, thus just
# move it to ${INSTALL_DIR} to prevent from downloading
mv zookeeper-bin ${INSTALL_DIR}/
fi
fi

INSTALL_DIR="$INSTALL_DIR" PORT="$PORT" $ROOT/scripts/start_zk.sh
}

Expand Down
68 changes: 39 additions & 29 deletions src/replica/mutation_log.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -939,58 +939,68 @@ error_code mutation_log::reset_from(const std::string &dir,
replay_callback replay_error_callback,
io_failure_callback write_error_callback)
{
error_code err = ERR_FILE_OPERATION_FAILED;

// close for flushing current log and be ready to open new log files after reset
// Close for flushing current log and get ready to open new log files after reset.
close();

// make sure logs in `dir` (such as /learn) are valid.
// Ensure that log files in `dir` (such as "/learn") are valid.
error_s es = log_utils::check_log_files_continuity(dir);
if (!es.is_ok()) {
LOG_ERROR_F("the log of source dir {} is invalid:{}, will remove it.", dir, es);
LOG_ERROR_F("the log files of source dir {} are invalid: {}, will remove it", dir, es);
if (!utils::filesystem::remove_path(dir)) {
acelyc111 marked this conversation as resolved.
Show resolved Hide resolved
LOG_ERROR_F("remove {} failed", dir);
return err;
LOG_ERROR_F("remove source dir {} failed", dir);
return ERR_FILE_OPERATION_FAILED;
}
return es.code();
}

std::string temp_dir = _dir + '.' + std::to_string(dsn_now_ns());
std::string temp_dir = fmt::format("{}.{}", _dir, dsn_now_ns());
if (!utils::filesystem::rename_path(_dir, temp_dir)) {
LOG_ERROR_F("rename {} to {} failed", _dir, temp_dir);
return err;
LOG_ERROR_F("rename current log dir {} to temp dir {} failed", _dir, temp_dir);
return ERR_FILE_OPERATION_FAILED;
}
LOG_INFO_F("moved current log dir {} to tmp_dir {}", _dir, temp_dir);
// define `defer` for rollback temp_dir when failed or remove temp_dir when success
auto temp_dir_resolve = dsn::defer([this, err, temp_dir]() {
if (err != ERR_OK) {
if (!utils::filesystem::rename_path(temp_dir, _dir)) {
// rollback failed means old log files are not be recovered, it may be lost if only
// LOG_ERROR, dassert for manual resolve it
// TODO(yingchun): will be fixed later
// CHECK(false, "rollback {} to {} failed", temp_dir, _dir);
}
} else {
LOG_INFO_F("rename current log dir {} to temp dir {}", _dir, temp_dir);

error_code err = ERR_OK;

// If successful, just remove temp dir; otherwise, rename temp dir back to current dir.
auto temp_dir_resolve = dsn::defer([this, temp_dir, &err]() {
if (err == ERR_OK) {
if (!dsn::utils::filesystem::remove_path(temp_dir)) {
// temp dir allow delete failed, it's only garbage
// Removing temp dir failed is allowed, it's just garbage.
LOG_ERROR_F("remove temp dir {} failed", temp_dir);
}
} else {
// Once rollback failed, dir should be recovered manually in case data is lost.
CHECK(utils::filesystem::rename_path(temp_dir, _dir),
"rename temp dir {} back to current dir {} failed",
temp_dir,
_dir);
}
});

// move source dir to target dir
// Rename source dir to current dir.
if (!utils::filesystem::rename_path(dir, _dir)) {
LOG_ERROR_F("rename {} to {} failed", dir, _dir);
LOG_ERROR_F("rename source dir {} to current dir {} failed", dir, _dir);
return err;
}
LOG_INFO_F("move {} to {} as our new log directory", dir, _dir);
LOG_INFO_F("rename source dir {} to current dir {} successfully", dir, _dir);

auto dir_resolve = dsn::defer([this, dir, &err]() {
if (err != ERR_OK) {
CHECK(utils::filesystem::rename_path(_dir, dir),
"rename current dir {} back to source dir {} failed",
_dir,
dir);
}
});

// - make sure logs in moved dir(such as /plog) are valid and can be opened successfully.
// - re-open new log files for loading the new log file and register the files into replica,
// please make sure the old log files has been closed
// 1. ensure that logs in current dir(such as "/plog") are valid and can be opened
// successfully;
// 2. reopen, load and register new log files into replica;
// 3. be sure that the old log files should have been closed.
err = open(replay_error_callback, write_error_callback);
if (err != ERR_OK) {
LOG_ERROR_F("the logs of moved dir {} are invalid and open failed:{}", _dir, err);
LOG_ERROR_F("the log files of current dir {} are invalid, thus open failed: {}", _dir, err);
}
return err;
}
Expand Down