Skip to content

Commit

Permalink
[qob] more debugging information on UnexpectedEOFError in Azure (#13160)
Browse files Browse the repository at this point in the history
Alright, I snagged the PR namespace from the CI:

```
pr-13135-default-u5tt5011yt5w
```

Then I went to the Azure [Log Analytics workspace
haildev-logs](https://portal.azure.com/#@haildev.onmicrosoft.com/resource/subscriptions/22cd45fe-f996-4c51-af67-ef329d977519/resourceGroups/haildev/providers/Microsoft.OperationalInsights/workspaces/haildev-logs/logs).

I went to "Queries", selected "DK's AKS Pod Logs", modified the
namespace to the aforementioned one, and added a filter for
"hail-az://".

```
let startTimestamp = ago(2h);
KubePodInventory
| where TimeGenerated > startTimestamp
| extend PodName=Name
| where Namespace == "pr-13135-default-u5tt5011yt5w" and PodName startswith "batch-driver"
| distinct ContainerID, PodName, Namespace
| join (
   ContainerLog
    | where TimeGenerated > startTimestamp
) on ContainerID
| project TimeGenerated, message=parse_json(LogEntry).message, LogEntry=parse_json(LogEntry)
| where message contains "hail-az://"
| order by TimeGenerated desc
```

That revealed the batch logs path:

```
EXAMPLE BATCH_JOB_LOGS_PATH hail-az://haildevtest/test/batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1/1/abc123/main/log
```

In the [failing PR test job
logs](https://ci.azure.hail.is/batches/3956877/jobs/152), I found the
batch id:

```
[2023-06-09 12:43:34] test/hail/methods/test_impex.py::BGENTests::test_import_bgen_row_fields
-------------------------------- live log call ---------------------------------
INFO     batch_client.aioclient:aioclient.py:753 created batch 1148

INFO     batch_client.aioclient:aioclient.py:770 updated batch 1148

FAILED
```

I listed the job logs:

```
(base) dking@wm28c-761 hail % az storage blob list --account-name haildevtest --container test --prefix batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/ -o table
Name                                                                           Blob Type    Blob Tier    Length    Content Type              Last Modified              Snapshot
-----------------------------------------------------------------------------  -----------  -----------  --------  ------------------------  -------------------------  ----------
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/1/i4CoSh/main/log                 BlockBlob    Hot          11724     application/octet-stream  2023-06-09T12:43:36+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/1/i4CoSh/main/resource_usage      BlockBlob    Hot          64        application/octet-stream  2023-06-09T12:43:36+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/1/i4CoSh/status.json              BlockBlob    Hot          1240      application/octet-stream  2023-06-09T12:43:36+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/2/31Owgv/main/log                 BlockBlob    Hot          16626     application/octet-stream  2023-06-09T12:44:22+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/2/31Owgv/main/resource_usage      BlockBlob    Hot          680       application/octet-stream  2023-06-09T12:44:22+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/2/31Owgv/status.json              BlockBlob    Hot          4453      application/octet-stream  2023-06-09T12:44:22+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/bunch/dK3o5ZfXmYSkP5TA/specs      BlockBlob    Hot          1264      application/octet-stream  2023-06-09T12:43:37+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/bunch/dK3o5ZfXmYSkP5TA/specs.idx  BlockBlob    Hot          16        application/octet-stream  2023-06-09T12:43:37+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/bunch/eOrFpVrN98GBIizi/specs      BlockBlob    Hot          1264      application/octet-stream  2023-06-09T12:43:34+00:00
batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/bunch/eOrFpVrN98GBIizi/specs.idx  BlockBlob    Hot          16        application/octet-stream  2023-06-09T12:43:34+00:00
```

I looked at the status:

```
az storage blob download --account-name haildevtest --container test --name batch/logs/we5a79QlczzdluUx8kT2Vh/batch/1148/2/31Owgv/status.json | jq '.' | less
```

which contained an error (I un-escaped the string here):

```
JVMUserError: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:192)
	at is.hail.JVMEntryway.retrieveException(JVMEntryway.java:253)
	at is.hail.JVMEntryway.finishFutures(JVMEntryway.java:215)
	at is.hail.JVMEntryway.main(JVMEntryway.java:185)
Caused by: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
	at is.hail.JVMEntryway$1.run(JVMEntryway.java:122)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.reflect.InvocationTargetException
	at sun.reflect.GeneratedMethodAccessor23.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at is.hail.JVMEntryway$1.run(JVMEntryway.java:119)
	... 7 more
Caused by: is.hail.backend.service.EndOfInputException
	at is.hail.backend.service.ServiceBackendSocketAPI2.read(ServiceBackend.scala:497)
	at is.hail.backend.service.ServiceBackendSocketAPI2.readInt(ServiceBackend.scala:510)
	at is.hail.backend.service.ServiceBackendSocketAPI2.executeOneCommand(ServiceBackend.scala:561)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$6(ServiceBackend.scala:462)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$6$adapted(ServiceBackend.scala:461)
	at is.hail.utils.package$.using(package.scala:635)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$5(ServiceBackend.scala:461)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at is.hail.services.package$.retryTransientErrors(package.scala:141)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$4(ServiceBackend.scala:460)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$4$adapted(ServiceBackend.scala:459)
	at is.hail.utils.package$.using(package.scala:635)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.$anonfun$main$3(ServiceBackend.scala:459)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at is.hail.services.package$.retryTransientErrors(package.scala:141)
	at is.hail.backend.service.ServiceBackendSocketAPI2$.main(ServiceBackend.scala:458)
	at is.hail.backend.service.Main$.main(Main.scala:15)
	at is.hail.backend.service.Main.main(Main.scala)
	... 11 more
```

Which suggests that the service backend experienced an EOF somewhere in
the first four bytes of the input file. Unfortunately, we automatically
cleanup the input and output files, so I can't investigate further. This
PR reads the input and output files and stores them in the error message
so that next time this happens we get more information.
  • Loading branch information
danking authored Jun 9, 2023
1 parent 60bf091 commit 39dc3c0
Showing 1 changed file with 25 additions and 55 deletions.
80 changes: 25 additions & 55 deletions hail/python/hail/backend/service_backend.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Dict, Optional, Callable, Awaitable, Mapping, Any, List, Union, Tuple, TypeVar, Set
import asyncio
import abc
import math
import struct
Expand Down Expand Up @@ -468,71 +467,42 @@ async def _rpc(self,
raise

with timings.step("read output"):
result_bytes = await self._resiliently_read_output(ir, iodir + '/out')
result_bytes = await retry_transient_errors(self._read_output, ir, iodir + '/out', iodir + '/in')
return token, result_bytes, timings

async def _resiliently_read_output(self, ir: Optional[BaseIR], output_uri: str) -> bytes:
# Azure appears to return EOF for valid reads. We suspect this is due to load shedding.
#
# /usr/local/lib/python3.8/dist-packages/hail/backend/service_backend.py:528: in _async_execute
# _, resp, timings = await self._rpc(
# /usr/local/lib/python3.8/dist-packages/hail/backend/service_backend.py:469: in _rpc
# result_bytes = await retry_transient_errors(self._read_output, ir, iodir + '/out')
# /usr/local/lib/python3.8/dist-packages/hailtop/utils/utils.py:780: in retry_transient_errors
# return await retry_transient_errors_with_debug_string('', 0, f, *args, **kwargs)
# /usr/local/lib/python3.8/dist-packages/hailtop/utils/utils.py:793: in retry_transient_errors_with_debug_string
# return await f(*args, **kwargs)
# /usr/local/lib/python3.8/dist-packages/hail/backend/service_backend.py:484: in _read_output
# success = await read_bool(outfile)
# /usr/local/lib/python3.8/dist-packages/hail/backend/service_backend.py:87: in read_bool
# return (await read_byte(strm)) != 0
# /usr/local/lib/python3.8/dist-packages/hail/backend/service_backend.py:83: in read_byte
# return (await strm.readexactly(1))[0]
# _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
#
# self = <hailtop.aiocloud.aioazure.fs.AzureReadableStream object at 0x7fe2083a5b80>
# n = 1
#
# async def readexactly(self, n: int) -> bytes:
# assert not self._closed and n >= 0
# data = await self.read(n)
# if len(data) != n:
# > raise UnexpectedEOFError()
# E hailtop.aiotools.fs.exceptions.UnexpectedEOFError
eofs_encountered = 0
while True:
try:
return await retry_transient_errors(self._read_output, ir, output_uri)
except UnexpectedEOFError as exc:
eofs_encountered += 1
if eofs_encountered == 2:
raise exc
await asyncio.sleep(1)

async def _read_output(self, ir: Optional[BaseIR], output_uri: str) -> bytes:
async def _read_output(self, ir: Optional[BaseIR], output_uri: str, input_uri: str) -> bytes:
assert self._batch

try:
driver_output = await self._async_fs.open(output_uri)
except FileNotFoundError as exc:
raise FatalError('Hail internal error. Please contact the Hail team and provide the following information.\n\n' + yamlx.dump({
'service_backend_debug_info': self.debug_info(),
'batch_debug_info': await self._batch.debug_info()
'batch_debug_info': await self._batch.debug_info(),
'input_uri': await self._async_fs.read(input_uri),
})) from exc

async with driver_output as outfile:
success = await read_bool(outfile)
if success:
return await read_bytes(outfile)

short_message = await read_str(outfile)
expanded_message = await read_str(outfile)
error_id = await read_int(outfile)

reconstructed_error = fatal_error_from_java_error_triplet(short_message, expanded_message, error_id)
if ir is None:
raise reconstructed_error
raise reconstructed_error.maybe_user_error(ir)
try:
async with driver_output as outfile:
success = await read_bool(outfile)
if success:
return await read_bytes(outfile)

short_message = await read_str(outfile)
expanded_message = await read_str(outfile)
error_id = await read_int(outfile)

reconstructed_error = fatal_error_from_java_error_triplet(short_message, expanded_message, error_id)
if ir is None:
raise reconstructed_error
raise reconstructed_error.maybe_user_error(ir)
except UnexpectedEOFError as exc:
raise FatalError('Hail internal error. Please contact the Hail team and provide the following information.\n\n' + yamlx.dump({
'service_backend_debug_info': self.debug_info(),
'batch_debug_info': await self._batch.debug_info(),
'in': await self._async_fs.read(input_uri),
'out': await self._async_fs.read(output_uri),
})) from exc

def _cancel_on_ctrl_c(self, coro: Awaitable[T]) -> T:
try:
Expand Down

0 comments on commit 39dc3c0

Please sign in to comment.