Skip to content

Commit

Permalink
Never create more than one process per WorkerMultiplexer.
Browse files Browse the repository at this point in the history
Turns out the ability to re-create a process makes everything complicated. Instead, just let the WorkerMultiplexer instance fall to the floor and create a new one as needed.

Also restores interrupts in more places, handles some non-io-exceptions better in WorkerSpawnRunner, checks a few more edge cases around the multiplexer,
makes the multiplexer try not to get interrupted during actual read, avoids creating unnecessary WorkerMultiplexer garbage, removes shutdownhooks on workerproxy
destruction, sets the multiplexer reporter earlier, and improves some error messages.

RELNOTES: n/a
PiperOrigin-RevId: 351606949
  • Loading branch information
larsrc-google authored and copybara-github committed Jan 13, 2021
1 parent f431b0c commit a607d9d
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 141 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,9 @@ String getRecordingStreamMessage() {
recordingInputStream.readRemaining();
return recordingInputStream.getRecordedDataAsString();
}

@Override
public String toString() {
return workerKey.getMnemonic() + " worker #" + workerId;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public void setOptions(WorkerOptions workerOptions) {
}

@Override
public Worker create(WorkerKey key) throws Exception {
public Worker create(WorkerKey key) {
int workerId = pidCounter.getAndIncrement();
String workTypeName = WorkerKey.makeWorkerTypeName(key.getProxied());
Path logFile =
Expand All @@ -66,9 +66,7 @@ public Worker create(WorkerKey key) throws Exception {
worker = new SandboxedWorker(key, workerId, workDir, logFile);
} else if (key.getProxied()) {
WorkerMultiplexer workerMultiplexer = WorkerMultiplexerManager.getInstance(key, logFile);
worker =
new WorkerProxy(
key, workerId, key.getExecRoot(), workerMultiplexer.getLogFile(), workerMultiplexer);
worker = new WorkerProxy(key, workerId, workerMultiplexer.getLogFile(), workerMultiplexer);
} else {
worker = new SingleplexWorker(key, workerId, key.getExecRoot(), logFile);
}
Expand Down Expand Up @@ -112,13 +110,12 @@ public PooledObject<Worker> wrap(Worker worker) {
@Override
public void destroyObject(WorkerKey key, PooledObject<Worker> p) throws Exception {
if (workerOptions.workerVerbose) {
int workerId = p.getObject().getWorkerId();
reporter.handle(
Event.info(
String.format(
"Destroying %s %s (id %d)",
key.getMnemonic(),
WorkerKey.makeWorkerTypeName(key.getProxied()),
p.getObject().getWorkerId())));
key.getMnemonic(), WorkerKey.makeWorkerTypeName(key.getProxied()), workerId)));
}
p.getObject().destroy();
}
Expand Down Expand Up @@ -161,10 +158,10 @@ public boolean validateObject(WorkerKey key, PooledObject<Worker> p) {
}
return false;
}
boolean hashMatches =
key.getWorkerFilesCombinedHash().equals(worker.getWorkerFilesCombinedHash());
boolean filesChanged =
!key.getWorkerFilesCombinedHash().equals(worker.getWorkerFilesCombinedHash());

if (workerOptions.workerVerbose && reporter != null && !hashMatches) {
if (workerOptions.workerVerbose && reporter != null && filesChanged) {
StringBuilder msg = new StringBuilder();
msg.append(
String.format(
Expand All @@ -191,6 +188,6 @@ public boolean validateObject(WorkerKey key, PooledObject<Worker> p) {
reporter.handle(Event.warn(msg.toString()));
}

return hashMatches;
return !filesChanged;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public Iterable<Class<? extends OptionsBase>> getCommandOptions(Command command)
public void beforeCommand(CommandEnvironment env) {
this.env = env;
env.getEventBus().register(this);
WorkerMultiplexerManager.beforeCommand(env);
}

@Subscribe
Expand Down Expand Up @@ -236,6 +237,6 @@ public void afterCommand() {
if (this.workerFactory != null) {
this.workerFactory.setReporter(null);
}
WorkerMultiplexerManager.afterCommandCleanup();
WorkerMultiplexerManager.afterCommand();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Semaphore;
import javax.annotation.Nullable;

/**
* An intermediate worker that sends requests and receives responses from the worker processes.
Expand All @@ -48,29 +49,23 @@ public class WorkerMultiplexer extends Thread {
* A map of {@code WorkResponse}s received from the worker process. They are stored in this map
* keyed by the request id until the corresponding {@code WorkerProxy} picks them up.
*/
private final ConcurrentMap<Integer, WorkResponse> workerProcessResponse;
private final ConcurrentMap<Integer, WorkResponse> workerProcessResponse =
new ConcurrentHashMap<>();
/**
* A map of semaphores corresponding to {@code WorkRequest}s. After sending the {@code
* WorkRequest}, {@code WorkerProxy} will wait on a semaphore to be released. {@code
* WorkerMultiplexer} is responsible for releasing the corresponding semaphore in order to signal
* {@code WorkerProxy} that the {@code WorkerResponse} has been received.
*/
private final ConcurrentMap<Integer, Semaphore> responseChecker;
/** The worker process that this WorkerMultiplexer should be talking to. */
private Subprocess process;
private final ConcurrentMap<Integer, Semaphore> responseChecker = new ConcurrentHashMap<>();
/**
* Set to true if one of the worker processes returns an unparseable response, or for other
* reasons we can't properly handle the remaining responses. We then discard all the responses
* from other work requests and abort.
* The worker process that this WorkerMultiplexer should be talking to. This should only be set
* once, when creating a new process. If the process dies or its stdio streams get corrupted, the
* {@code WorkerMultiplexer} gets discarded as well and a new one gets created as needed.
*/
private boolean isWorkerStreamCorrupted;
private Subprocess process;
/** InputStream from the worker process. */
private RecordingInputStream recordingStream;
/**
* True if we have received EOF on the stream from the worker process. We then stop processing,
* and all workers still waiting for responses will fail.
*/
private boolean isWorkerStreamClosed;
/** True if this multiplexer was explicitly destroyed. */
private boolean wasDestroyed;
/**
Expand All @@ -89,25 +84,20 @@ public class WorkerMultiplexer extends Thread {
* The active Reporter object, non-null if {@code --worker_verbose} is set. This must be cleared
* at the end of a command execution.
*/
public EventHandler reporter;
private EventHandler reporter;

WorkerMultiplexer(Path logFile, WorkerKey workerKey) {
this.logFile = logFile;
this.workerKey = workerKey;
responseChecker = new ConcurrentHashMap<>();
workerProcessResponse = new ConcurrentHashMap<>();
isWorkerStreamCorrupted = false;
isWorkerStreamClosed = false;
wasDestroyed = false;
}

/** Sets or clears the reporter for outputting verbose info. */
void setReporter(EventHandler reporter) {
synchronized void setReporter(@Nullable EventHandler reporter) {
this.reporter = reporter;
}

/** Reports a string to the user if reporting is enabled. */
private void report(String s) {
private synchronized void report(String s) {
EventHandler r = this.reporter; // Protect against race condition with setReporter().
if (r != null && s != null) {
r.handle(Event.info(s));
Expand All @@ -119,17 +109,17 @@ private void report(String s) {
* exist. Also makes sure this {@code WorkerMultiplexer} runs as a separate thread.
*/
public synchronized void createProcess(Path workDir) throws IOException {
// The process may have died in the meanwhile (e.g. between builds).
if (this.process == null || !this.process.isAlive()) {
if (this.process == null) {
if (this.wasDestroyed) {
throw new IOException("Multiplexer destroyed before created process");
}
ImmutableList<String> args = workerKey.getArgs();
File executable = new File(args.get(0));
if (!executable.isAbsolute() && executable.getParent() != null) {
List<String> newArgs = new ArrayList<>(args);
newArgs.set(0, new File(workDir.getPathFile(), newArgs.get(0)).getAbsolutePath());
args = ImmutableList.copyOf(newArgs);
}
isWorkerStreamCorrupted = false;
isWorkerStreamClosed = false;
SubprocessBuilder processBuilder =
subprocessFactory != null
? new SubprocessBuilder(subprocessFactory)
Expand All @@ -139,6 +129,8 @@ public synchronized void createProcess(Path workDir) throws IOException {
processBuilder.setStderr(logFile.getPathFile());
processBuilder.setEnv(workerKey.getEnv());
this.process = processBuilder.start();
} else if (!this.process.isAlive()) {
throw new IOException("Process is dead");
}
if (!this.isAlive()) {
this.start();
Expand All @@ -155,24 +147,24 @@ public Path getLogFile() {

/**
* Signals this object to destroy itself, including the worker process. The object might not be
* fully destroyed at the end of this call, but will terminate soon.
* fully destroyed at the end of this call, but will terminate soon. This is considered a
* deliberate destruction.
*/
public synchronized void destroyMultiplexer() {
if (this.process != null) {
destroyProcess(this.process);
this.process = null;
destroyProcess();
}
wasDestroyed = true;
}

/** Destroys the worker subprocess. This might block forever if the subprocess refuses to die. */
private void destroyProcess(Subprocess process) {
private synchronized void destroyProcess() {
boolean wasInterrupted = false;
try {
process.destroy();
this.process.destroy();
while (true) {
try {
process.waitFor();
this.process.waitFor();
return;
} catch (InterruptedException ie) {
wasInterrupted = true;
Expand All @@ -183,7 +175,6 @@ private void destroyProcess(Subprocess process) {
if (wasInterrupted) {
Thread.currentThread().interrupt(); // preserve interrupted status
}
isWorkerStreamClosed = true;
}
}

Expand All @@ -200,10 +191,6 @@ public synchronized void putRequest(WorkRequest request) throws IOException {
// We can't know how much of the request was sent, so we have to assume the worker's input
// now contains garbage.
// TODO(b/151767359): Avoid causing garbage! Maybe by sending in a separate thread?
isWorkerStreamCorrupted = true;
if (e instanceof InterruptedIOException) {
Thread.currentThread().interrupt();
}
responseChecker.remove(request.getRequestId());
throw e;
}
Expand All @@ -228,10 +215,8 @@ public WorkResponse getResponse(Integer requestId) throws InterruptedException {
// Wait for the multiplexer to get our response and release this semaphore. The semaphore will
// throw {@code InterruptedException} when the multiplexer is terminated.
waitForResponse.acquire();
report("Acquired response semaphore for " + requestId);

WorkResponse workResponse = workerProcessResponse.get(requestId);
report("Response for " + requestId + " is " + workResponse);
return workResponse;
} finally {
responseChecker.remove(requestId);
Expand All @@ -247,25 +232,25 @@ public WorkResponse getResponse(Integer requestId) throws InterruptedException {
* execution cancellation.
*/
private void waitResponse() throws InterruptedException, IOException {
Subprocess p = this.process;
if (p == null || !p.isAlive()) {
// Avoid busy-wait for a new process.
recordingStream = new RecordingInputStream(this.process.getInputStream());
recordingStream.startRecording(4096);
// TODO(larsrc): Turn this into a loop that also sends requests.
// Allow interrupts while waiting for responses, without conflating it with I/O errors.
while (recordingStream.available() == 0) {
if (!this.process.isAlive()) {
throw new IOException(
String.format("Multiplexer process for %s is dead", workerKey.getMnemonic()));
}
Thread.sleep(1);
return;
}
recordingStream = new RecordingInputStream(p.getInputStream());
recordingStream.startRecording(4096);
WorkResponse parsedResponse = WorkResponse.parseDelimitedFrom(recordingStream);

// A null parsedResponse can only happen if the input stream is closed, in which case we
// drop everything.
if (parsedResponse == null) {
isWorkerStreamClosed = true;
report(
throw new IOException(
String.format(
"Multiplexer process for %s has closed its output, aborting multiplexer",
workerKey.getMnemonic()));
return;
"Multiplexer process for %s died while reading response", workerKey.getMnemonic()));
}

int requestId = parsedResponse.getRequestId();
Expand All @@ -287,13 +272,15 @@ private void waitResponse() throws InterruptedException, IOException {
/** The multiplexer thread that listens to the WorkResponse from worker process. */
@Override
public void run() {
while (!isWorkerStreamClosed && !isWorkerStreamCorrupted) {
while (this.process.isAlive()) {
try {
waitResponse();
} catch (IOException e) {
// We got this exception while reading from the worker's stdout. We can't trust the
// output any more at that point.
isWorkerStreamCorrupted = true;
if (this.process.isAlive()) {
destroyProcess();
}
if (e instanceof InterruptedIOException) {
report(
String.format(
Expand All @@ -315,17 +302,12 @@ public void run() {
// will let fall on the floor, but we still want to leave the process running for the next
// build.
// TODO(b/151767359): Cancel all outstanding requests when cancellation is implemented.
releaseAllSemaphores();
for (Semaphore semaphore : responseChecker.values()) {
semaphore.release();
}
}
}
// If we get here, the worker process is either dead or corrupted. We could attempt to restart
// it, but the outstanding requests will have failed already. Until we have a way to signal
// transient failures, we have to just reject all further requests and make sure the process
// is really dead
synchronized (this) {
if (process != null && process.isAlive()) {
destroyMultiplexer();
}
releaseAllSemaphores();
}
}
Expand All @@ -350,14 +332,14 @@ String getRecordingStreamMessage() {

/** Returns true if this process has died for other reasons than a call to {@code #destroy()}. */
boolean diedUnexpectedly() {
Subprocess p = this.process; // Protects against this.process getting null.
return p != null && !p.isAlive() && !wasDestroyed;
return this.process != null && !this.process.isAlive() && !wasDestroyed;
}

/** Returns the exit value of multiplexer's process, if it has exited. */
Optional<Integer> getExitValue() {
Subprocess p = this.process; // Protects against this.process getting null.
return p != null && !p.isAlive() ? Optional.of(p.exitValue()) : Optional.empty();
return this.process != null && !this.process.isAlive()
? Optional.of(this.process.exitValue())
: Optional.empty();
}

/** For testing only, to verify that maps are cleared after responses are reaped. */
Expand Down
Loading

0 comments on commit a607d9d

Please sign in to comment.