Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error-checking for a couple of corruption issues #8059

Merged
merged 2 commits on Jun 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion python/ray/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import ray.ray_constants as ray_constants
from ray.utils import binary_to_hex, setup_logger
from ray.autoscaler.commands import teardown_cluster
import redis

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -161,7 +162,11 @@ def process_messages(self, max_messages=10000):
subscribe_clients = [self.primary_subscribe_client]
for subscribe_client in subscribe_clients:
for _ in range(max_messages):
message = subscribe_client.get_message()
message = None
try:
message = subscribe_client.get_message()
except redis.exceptions.ConnectionError:
pass
if message is None:
# Continue on to the next subscribe client.
break
Expand Down
5 changes: 5 additions & 0 deletions src/ray/core_worker/core_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ void CoreWorkerProcess::EnsureInitialized() {
CoreWorker &CoreWorkerProcess::GetCoreWorker() {
EnsureInitialized();
if (instance_->options_.num_workers == 1) {
// TODO(mehrdadn): Remove this when the bug is resolved.
// Somewhat consistently reproducible via
// python/ray/tests/test_basic.py::test_background_tasks_with_max_calls
// with -c opt on Windows.
RAY_CHECK(instance_->global_worker_) << "global_worker_ must not be NULL";
return *instance_->global_worker_;
}
auto ptr = current_core_worker_.lock();
Expand Down
6 changes: 4 additions & 2 deletions src/ray/gcs/asio.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ void RedisAsioClient::operate() {

void RedisAsioClient::handle_read(boost::system::error_code error_code) {
RAY_CHECK(!error_code || error_code == boost::asio::error::would_block ||
error_code == boost::asio::error::connection_reset);
error_code == boost::asio::error::connection_reset)
<< "handle_read(error_code = " << error_code << ")";
read_in_progress_ = false;
redis_async_context_.RedisAsyncHandleRead();

Expand All @@ -90,7 +91,8 @@ void RedisAsioClient::handle_read(boost::system::error_code error_code) {

void RedisAsioClient::handle_write(boost::system::error_code error_code) {
RAY_CHECK(!error_code || error_code == boost::asio::error::would_block ||
error_code == boost::asio::error::connection_reset);
error_code == boost::asio::error::connection_reset)
<< "handle_write(error_code = " << error_code << ")";
write_in_progress_ = false;
redis_async_context_.RedisAsyncHandleWrite();

Expand Down
6 changes: 5 additions & 1 deletion src/ray/gcs/redis_async_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ void RedisAsyncContext::RedisAsyncHandleRead() {
// This function will execute the callbacks which are registered by
// `redisvAsyncCommand`, `redisAsyncCommandArgv` and so on.
std::lock_guard<std::mutex> lock(mutex_);

// TODO(mehrdadn): Remove this when the bug is resolved.
// Somewhat consistently reproducible via
// python/ray/tests/test_basic.py::test_background_tasks_with_max_calls
// with -c opt on Windows.
RAY_CHECK(redis_async_context_) << "redis_async_context_ must not be NULL here";
redisAsyncHandleRead(redis_async_context_);
}

Expand Down