Skip to content

Commit

Permalink
Handle OutOfMemory runner errors with gVisor
Browse files Browse the repository at this point in the history
  • Loading branch information
MrSerth committed Jul 6, 2023
1 parent 891af41 commit 83fcc73
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 19 deletions.
28 changes: 11 additions & 17 deletions app/controllers/submissions_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ def render_file
end
end

# rubocop:disable Metrics/CyclomaticComplexity
def run
def run # rubocop:disable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
# These method-local socket variables are required in order to use one socket
# in the callbacks of the other socket. As the callbacks for the client socket
# are registered first, the runner socket may still be nil.
Expand Down Expand Up @@ -199,12 +198,6 @@ def run
end
stream = @testrun[:status] == :ok ? :stdout : :stderr
send_and_store client_socket, {cmd: :write, stream:, data: "#{exit_statement}\n"}
if exit_code == 137
send_and_store client_socket, {cmd: :status, status: :out_of_memory}
@testrun[:status] = :out_of_memory
end

# The client connection will be closed once the file listing finished.
end

runner_socket.on :files do |files|
Expand All @@ -213,30 +206,33 @@ def run
js_tree = FileTree.new(downloadable_files).to_js_tree
send_and_store client_socket, {cmd: :files, data: js_tree}
end

close_client_connection(client_socket)
end
end
@testrun[:container_execution_time] = durations[:execution_duration]
@testrun[:waiting_for_container_time] = durations[:waiting_duration]
rescue Runner::Error::ExecutionTimeout => e
send_and_store client_socket, {cmd: :status, status: :timeout}
close_client_connection(client_socket)
Rails.logger.debug { "Running a submission timed out: #{e.message}" }
@testrun[:status] ||= :timeout
@testrun[:output] = "timeout: #{@testrun[:output]}"
extract_durations(e)
rescue Runner::Error::OutOfMemory => e
send_and_store client_socket, {cmd: :status, status: :out_of_memory}
Rails.logger.debug { "Running a submission caused an out of memory error: #{e.message}" }
@testrun[:status] ||= :out_of_memory
@testrun[:exit_code] ||= 137
@testrun[:output] = "out_of_memory: #{@testrun[:output]}"
extract_durations(e)
rescue Runner::Error => e
# Regardless of the specific error cause, we send a `container_depleted` status to the client.
send_and_store client_socket, {cmd: :status, status: :container_depleted}
close_client_connection(client_socket)
@testrun[:status] ||= :container_depleted
Rails.logger.debug { "Runner error while running a submission: #{e.message}" }
extract_durations(e)
ensure
close_client_connection(client_socket)
save_testrun_output 'run'
end
# rubocop:enable Metrics/CyclomaticComplexity:

def score
client_socket = nil
Expand All @@ -256,14 +252,13 @@ def score
client_socket&.send_data(JSON.dump(@submission.calculate_score))
# To enable hints when scoring a submission, uncomment the next line:
# send_hints(client_socket, StructuredError.where(submission: @submission))
kill_client_socket(client_socket)
rescue Runner::Error => e
extract_durations(e)
send_and_store client_socket, {cmd: :status, status: :container_depleted}
kill_client_socket(client_socket)
Rails.logger.debug { "Runner error while scoring submission #{@submission.id}: #{e.message}" }
@testrun[:passed] = false
ensure
kill_client_socket(client_socket)
save_testrun_output 'assess'
end

Expand All @@ -290,14 +285,13 @@ def test

# The score is stored separately, we can forward it to the client immediately
client_socket&.send_data(JSON.dump(@submission.test(@file)))
kill_client_socket(client_socket)
rescue Runner::Error => e
extract_durations(e)
send_and_store client_socket, {cmd: :status, status: :container_depleted}
kill_client_socket(client_socket)
Rails.logger.debug { "Runner error while testing submission #{@submission.id}: #{e.message}" }
@testrun[:passed] = false
ensure
kill_client_socket(client_socket)
save_testrun_output 'assess'
end

Expand Down
2 changes: 2 additions & 0 deletions app/errors/runner/error.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class UnexpectedResponse < Error; end

class WorkspaceError < Error; end

# Raised when an execution exceeds its memory limit.
class OutOfMemory < Error; end

class Unknown < Error; end
end
end
3 changes: 3 additions & 0 deletions app/models/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ def execute_command(command, privileged_execution: false, raise_exception: true)
rescue Runner::Error::ExecutionTimeout => e
Rails.logger.debug { "Running command `#{command}` timed out: #{e.message}" }
output.merge!(status: :timeout, container_execution_time: e.execution_duration)
rescue Runner::Error::OutOfMemory => e
Rails.logger.debug { "Running command `#{command}` caused an out of memory error: #{e.message}" }
output.merge!(status: :out_of_memory, container_execution_time: e.execution_duration)
rescue Runner::Error::RunnerNotFound => e
Rails.logger.debug { "Running command `#{command}` failed for the first time: #{e.message}" }
try += 1
Expand Down
18 changes: 16 additions & 2 deletions lib/runner/connection.rb
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,14 @@ def on_close(event, sentry_span)
@strategy.destroy_at_management
@error = Runner::Error::ExecutionTimeout.new('Execution exceeded its time limit')
when :terminated_by_codeocean, :terminated_by_management
@exit_callback.call @exit_code
list_filesystem
# Poseidon (without gVisor) and DockerContainerPool do not handle memory limits explicitly.
# Instead, they signal that the program was terminated with exit code 137 (128 + 9).
if @exit_code == 137
@error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
else
@exit_callback.call @exit_code
list_filesystem
end
when :terminated_by_client, :error
@strategy.destroy_at_management
else # :established
Expand Down Expand Up @@ -223,6 +229,14 @@ def handle_stderr(event)
end

def handle_error(event)
# Poseidon (with gVisor enabled!) sends an error message when the execution exceeds its memory limit.
# This is not an error in the sense of the runner management but rather a message.
# We handle it here to avoid the error handling in the default case.
if event[:data] == 'the allocation was OOM Killed'
@error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
return
end

# In case of a (Nomad) error during execution, the runner management will notify us with an error message here.
# This shouldn't happen too often and can be considered an internal server error by the runner management.
# More information is available in the logs of the runner management or the orchestrator (e.g., Nomad).
Expand Down

0 comments on commit 83fcc73

Please sign in to comment.