Commit dbe2d61

HADOOP-19096. [ABFS] [CST Optimization] Enhance Client-Side Throttling Metrics Logic (#6276)
ABFS has a client-side throttling mechanism which works on the metrics collected from past requests. When requests fail due to server-side throttling, it updates its metrics and recalculates any client-side backoff. The choice of which requests should be used to compute the client-side backoff interval is based on the HTTP status code:

- Status code in 2xx range: Successful operations should contribute.
- Status code in 3xx range: Redirection operations should not contribute.
- Status code in 4xx range: User errors should not contribute.
- Status code is 503: Throttling errors should contribute only if they are due to a client limits breach, as follows:
  * 503, Ingress Over Account Limit: Should contribute
  * 503, Egress Over Account Limit: Should contribute
  * 503, TPS Over Account Limit: Should contribute
  * 503, Other Server Throttling: Should not contribute
- Status code in 5xx range other than 503: Should not contribute.
- IOException and UnknownHostException: Should not contribute.

Contributed by Anuj Modi
1 parent 281e2d2 commit dbe2d61
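The rules above can be read as a single predicate over the response status code and, for 503 responses, the server's throttling message. The following standalone sketch expresses them using only JDK constants; it is illustrative only (class and method names are hypothetical, not the ABFS internals), but the messages it matches are the ones this commit adds to AzureServiceErrorCode.

import java.net.HttpURLConnection;

// Illustrative only: mirrors the contribution rules from the commit message.
public final class CstContributionRuleSketch {

  // 503 messages that indicate a breach of a client's account limits.
  private static final String[] ACCOUNT_LIMIT_MESSAGES = {
      "Ingress is over the account limit.",
      "Egress is over the account limit.",
      "Operations per second is over the account limit."
  };

  /**
   * @param statusCode HTTP status of the completed request, or -1 if an
   *                   IOException/UnknownHostException prevented a response.
   * @param storageErrorMessage server-provided error message (may be null).
   * @return true if this request should feed the client-side throttling metrics.
   */
  public static boolean shouldContribute(int statusCode, String storageErrorMessage) {
    if (statusCode < 0) {
      return false;                                   // no response: IOException / UnknownHostException
    }
    if (statusCode < HttpURLConnection.HTTP_MULT_CHOICE) {
      return true;                                    // 2xx: successful operations contribute
    }
    if (statusCode == HttpURLConnection.HTTP_UNAVAILABLE) {
      for (String message : ACCOUNT_LIMIT_MESSAGES) { // 503: only account-limit throttling contributes
        if (message.equalsIgnoreCase(storageErrorMessage)) {
          return true;
        }
      }
    }
    return false;                                     // 3xx, 4xx, other 5xx, other 503 throttling
  }

  public static void main(String[] args) {
    System.out.println(shouldContribute(200, null));                                  // true
    System.out.println(shouldContribute(503, "Ingress is over the account limit."));  // true
    System.out.println(shouldContribute(404, null));                                  // false
  }
}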

8 files changed (+134, -77 lines)

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/contracts/services/AzureServiceErrorCode.java

Lines changed: 8 additions & 2 deletions
@@ -42,8 +42,14 @@ public enum AzureServiceErrorCode {
   INVALID_SOURCE_OR_DESTINATION_RESOURCE_TYPE("InvalidSourceOrDestinationResourceType", HttpURLConnection.HTTP_CONFLICT, null),
   RENAME_DESTINATION_PARENT_PATH_NOT_FOUND("RenameDestinationParentPathNotFound", HttpURLConnection.HTTP_NOT_FOUND, null),
   INVALID_RENAME_SOURCE_PATH("InvalidRenameSourcePath", HttpURLConnection.HTTP_CONFLICT, null),
-  INGRESS_OVER_ACCOUNT_LIMIT(null, HttpURLConnection.HTTP_UNAVAILABLE, "Ingress is over the account limit."),
-  EGRESS_OVER_ACCOUNT_LIMIT(null, HttpURLConnection.HTTP_UNAVAILABLE, "Egress is over the account limit."),
+  INGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
+      "Ingress is over the account limit."),
+  EGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
+      "Egress is over the account limit."),
+  TPS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
+      "Operations per second is over the account limit."),
+  OTHER_SERVER_THROTTLING("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
+      "The server is currently unable to receive requests. Please retry your request."),
   INVALID_QUERY_PARAMETER_VALUE("InvalidQueryParameterValue", HttpURLConnection.HTTP_BAD_REQUEST, null),
   AUTHORIZATION_PERMISSION_MISS_MATCH("AuthorizationPermissionMismatch", HttpURLConnection.HTTP_FORBIDDEN, null),
   ACCOUNT_REQUIRES_HTTPS("AccountRequiresHttps", HttpURLConnection.HTTP_BAD_REQUEST, null),
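As a rough illustration of the pattern this enum follows (a sketch, not the actual AzureServiceErrorCode class), each constant now pairs the service error code "ServerBusy" with the HTTP status and the exact message the service returns, so a 503 response can be resolved to a specific throttling variant by its message:

import java.net.HttpURLConnection;

// Sketch of the enum pattern: error code + status + message per 503 variant.
enum ServerBusyVariantSketch {
  INGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
      "Ingress is over the account limit."),
  EGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
      "Egress is over the account limit."),
  TPS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
      "Operations per second is over the account limit."),
  OTHER_SERVER_THROTTLING("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
      "The server is currently unable to receive requests. Please retry your request.");

  private final String errorCode;
  private final int statusCode;
  private final String errorMessage;

  ServerBusyVariantSketch(String errorCode, int statusCode, String errorMessage) {
    this.errorCode = errorCode;
    this.statusCode = statusCode;
    this.errorMessage = errorMessage;
  }

  String getErrorCode() { return errorCode; }
  int getStatusCode() { return statusCode; }
  String getErrorMessage() { return errorMessage; }

  // Resolve a 503 response to its variant by matching the message, case-insensitively.
  static ServerBusyVariantSketch fromMessage(String message) {
    for (ServerBusyVariantSketch variant : values()) {
      if (variant.errorMessage.equalsIgnoreCase(message)) {
        return variant;
      }
    }
    return OTHER_SERVER_THROTTLING;
  }
}

Note that INGRESS_OVER_ACCOUNT_LIMIT and EGRESS_OVER_ACCOUNT_LIMIT previously carried a null error code; this commit sets them to "ServerBusy", matching the two new constants.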

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsClient.java

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ private AbfsClient(final URL baseUrl,
       DelegatingSSLSocketFactory.initializeDefaultFactory(this.abfsConfiguration.getPreferredSSLFactoryOption());
       sslProviderName = DelegatingSSLSocketFactory.getDefaultFactory().getProviderName();
     } catch (IOException e) {
-      // Suppress exception. Failure to init DelegatingSSLSocketFactory would have only performance impact.
+      // Suppress exception, failure to init DelegatingSSLSocketFactory would have only performance impact.
       LOG.trace("NonCritFailure: DelegatingSSLSocketFactory Init failed : "
           + "{}", e.getMessage());
     }

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsRestOperation.java

Lines changed: 58 additions & 37 deletions
@@ -39,7 +39,9 @@
 import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding;
 
 import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_CONTINUE;
-import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_ABBREVIATION;
+import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION;
+import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION;
+import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.TPS_LIMIT_BREACH_ABBREVIATION;
 
 /**
  * The AbfsRestOperation for Rest AbfsClient.
@@ -283,7 +285,8 @@ String getClientLatency() {
   private boolean executeHttpOperation(final int retryCount,
       TracingContext tracingContext) throws AzureBlobFileSystemException {
     AbfsHttpOperation httpOperation;
-    boolean wasIOExceptionThrown = false;
+    // Used to avoid CST Metric Update in Case of UnknownHost/IO Exception.
+    boolean wasKnownExceptionThrown = false;
 
     try {
       // initialize the HTTP request and open the connection
@@ -321,7 +324,27 @@ private boolean executeHttpOperation(final int retryCount,
       } else if (httpOperation.getStatusCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
         incrementCounter(AbfsStatistic.SERVER_UNAVAILABLE, 1);
       }
+
+      // If no exception occurred till here it means http operation was successfully complete and
+      // a response from server has been received which might be failure or success.
+      // If any kind of exception has occurred it will be caught below.
+      // If request failed to determine failure reason and retry policy here.
+      // else simply return with success after saving the result.
+      LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);
+
+      int status = httpOperation.getStatusCode();
+      failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
+      retryPolicy = client.getRetryPolicy(failureReason);
+
+      if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) {
+        return false;
+      }
+
+      // If the request has succeeded or failed with non-retrial error, save the operation and return.
+      result = httpOperation;
+
     } catch (UnknownHostException ex) {
+      wasKnownExceptionThrown = true;
       String hostname = null;
       hostname = httpOperation.getHost();
       failureReason = RetryReason.getAbbreviation(ex, null, null);
@@ -333,57 +356,27 @@ private boolean executeHttpOperation(final int retryCount,
       }
       return false;
     } catch (IOException ex) {
+      wasKnownExceptionThrown = true;
       if (LOG.isDebugEnabled()) {
         LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex);
       }
 
       failureReason = RetryReason.getAbbreviation(ex, -1, "");
       retryPolicy = client.getRetryPolicy(failureReason);
-      wasIOExceptionThrown = true;
       if (!retryPolicy.shouldRetry(retryCount, -1)) {
         throw new InvalidAbfsRestOperationException(ex, retryCount);
       }
 
       return false;
     } finally {
-      int status = httpOperation.getStatusCode();
-      /*
-        A status less than 300 (2xx range) or greater than or equal
-        to 500 (5xx range) should contribute to throttling metrics being updated.
-        Less than 200 or greater than or equal to 500 show failed operations. 2xx
-        range contributes to successful operations. 3xx range is for redirects
-        and 4xx range is for user errors. These should not be a part of
-        throttling backoff computation.
-       */
-      boolean updateMetricsResponseCode = (status < HttpURLConnection.HTTP_MULT_CHOICE
-          || status >= HttpURLConnection.HTTP_INTERNAL_ERROR);
-
-      /*
-        Connection Timeout failures should not contribute to throttling
-        In case the current request fails with Connection Timeout we will have
-        ioExceptionThrown true and failure reason as CT
-        In case the current request failed with 5xx, failure reason will be
-        updated after finally block but wasIOExceptionThrown will be false;
-       */
-      boolean isCTFailure = CONNECTION_TIMEOUT_ABBREVIATION.equals(failureReason) && wasIOExceptionThrown;
-
-      if (updateMetricsResponseCode && !isCTFailure) {
+      int statusCode = httpOperation.getStatusCode();
+      // Update Metrics only if Succeeded or Throttled due to account limits.
+      // Also Update in case of any unhandled exception is thrown.
+      if (shouldUpdateCSTMetrics(statusCode) && !wasKnownExceptionThrown) {
         intercept.updateMetrics(operationType, httpOperation);
       }
     }
 
-    LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);
-
-    int status = httpOperation.getStatusCode();
-    failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
-    retryPolicy = client.getRetryPolicy(failureReason);
-
-    if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) {
-      return false;
-    }
-
-    result = httpOperation;
-
     return true;
   }
 
@@ -443,6 +436,34 @@ private void incrementCounter(AbfsStatistic statistic, long value) {
     }
   }
 
+  /**
+   * Updating Client Side Throttling Metrics for relevant response status codes.
+   * Following criteria is used to decide based on status code and failure reason.
+   * <ol>
+   *   <li>Case 1: Status code in 2xx range: Successful Operations should contribute</li>
+   *   <li>Case 2: Status code in 3xx range: Redirection Operations should not contribute</li>
+   *   <li>Case 3: Status code in 4xx range: User Errors should not contribute</li>
+   *   <li>
+   *     Case 4: Status code is 503: Throttling Error should contribute as following:
+   *     <ol>
+   *       <li>Case 4.a: Ingress Over Account Limit: Should Contribute</li>
+   *       <li>Case 4.b: Egress Over Account Limit: Should Contribute</li>
+   *       <li>Case 4.c: TPS Over Account Limit: Should Contribute</li>
+   *       <li>Case 4.d: Other Server Throttling: Should not contribute</li>
+   *     </ol>
+   *   </li>
+   *   <li>Case 5: Status code in 5xx range other than 503: Should not contribute</li>
+   * </ol>
+   * @param statusCode
+   * @return
+   */
+  private boolean shouldUpdateCSTMetrics(final int statusCode) {
+    return statusCode < HttpURLConnection.HTTP_MULT_CHOICE // Case 1
+        || INGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.a
+        || EGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.b
+        || TPS_LIMIT_BREACH_ABBREVIATION.equals(failureReason); // Case 4.c
+  }
+
   /**
    * Creates a new Tracing context before entering the retry loop of a rest operation.
    * This will ensure all rest operations have unique
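The control-flow change is easiest to see in isolation: failure reason and retry decision are now computed inside the try block while the response is in hand, and the finally block updates throttling metrics only when no IOException/UnknownHostException was caught and the response qualifies. A minimal standalone sketch of that pattern follows; the names are hypothetical, not the AbfsRestOperation API.

import java.io.IOException;
import java.net.HttpURLConnection;

// Sketch of the try/catch/finally pattern used to gate metric updates.
public final class MetricsUpdateFlowSketch {

  interface ThrottlingMetrics {
    void update(int statusCode);
  }

  interface HttpCall {
    int execute() throws IOException;   // returns a status code or throws
  }

  private final ThrottlingMetrics metrics;

  MetricsUpdateFlowSketch(ThrottlingMetrics metrics) {
    this.metrics = metrics;
  }

  boolean executeOnce(HttpCall call) throws IOException {
    // Set when the request dies with an IOException/UnknownHostException,
    // so the finally block skips the client-side throttling update.
    boolean wasKnownExceptionThrown = false;
    int statusCode = -1;
    try {
      statusCode = call.execute();
      // Decide retry vs. success here, while the response is available.
      return statusCode < HttpURLConnection.HTTP_INTERNAL_ERROR;
    } catch (IOException ex) {
      wasKnownExceptionThrown = true;
      throw ex;
    } finally {
      // A real response was received; feed client-side throttling metrics.
      // (The ABFS change additionally filters by status code and failure
      // reason, as described in the commit message.)
      if (!wasKnownExceptionThrown && statusCode >= 0) {
        metrics.update(statusCode);
      }
    }
  }
}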

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/RetryReasonConstants.java

Lines changed: 2 additions & 2 deletions
@@ -26,13 +26,13 @@ private RetryReasonConstants() {
   public static final String CONNECTION_TIMEOUT_JDK_MESSAGE = "connect timed out";
   public static final String READ_TIMEOUT_JDK_MESSAGE = "Read timed out";
   public static final String CONNECTION_RESET_MESSAGE = "Connection reset";
-  public static final String OPERATION_BREACH_MESSAGE = "Operations per second is over the account limit.";
   public static final String CONNECTION_RESET_ABBREVIATION = "CR";
   public static final String CONNECTION_TIMEOUT_ABBREVIATION = "CT";
   public static final String READ_TIMEOUT_ABBREVIATION = "RT";
   public static final String INGRESS_LIMIT_BREACH_ABBREVIATION = "ING";
   public static final String EGRESS_LIMIT_BREACH_ABBREVIATION = "EGR";
-  public static final String OPERATION_LIMIT_BREACH_ABBREVIATION = "OPR";
+  public static final String TPS_LIMIT_BREACH_ABBREVIATION = "OPR";
+  public static final String OTHER_SERVER_THROTTLING_ABBREVIATION = "OTH";
   public static final String UNKNOWN_HOST_EXCEPTION_ABBREVIATION = "UH";
   public static final String IO_EXCEPTION_ABBREVIATION = "IOE";
   public static final String SOCKET_EXCEPTION_ABBREVIATION = "SE";

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/retryReasonCategories/ServerErrorRetryReason.java

Lines changed: 10 additions & 4 deletions
@@ -22,10 +22,12 @@
 import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT;
 import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.EGRESS_OVER_ACCOUNT_LIMIT;
 import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.INGRESS_OVER_ACCOUNT_LIMIT;
+import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.OTHER_SERVER_THROTTLING;
+import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.TPS_OVER_ACCOUNT_LIMIT;
 import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION;
 import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION;
-import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_BREACH_MESSAGE;
-import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_LIMIT_BREACH_ABBREVIATION;
+import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.TPS_LIMIT_BREACH_ABBREVIATION;
+import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OTHER_SERVER_THROTTLING_ABBREVIATION;
 
 /**
  * Category that can capture server-response errors for 5XX status-code.
@@ -56,9 +58,13 @@ String getAbbreviation(final Integer statusCode,
         splitedServerErrorMessage)) {
       return EGRESS_LIMIT_BREACH_ABBREVIATION;
     }
-    if (OPERATION_BREACH_MESSAGE.equalsIgnoreCase(
+    if (TPS_OVER_ACCOUNT_LIMIT.getErrorMessage().equalsIgnoreCase(
         splitedServerErrorMessage)) {
-      return OPERATION_LIMIT_BREACH_ABBREVIATION;
+      return TPS_LIMIT_BREACH_ABBREVIATION;
+    }
+    if (OTHER_SERVER_THROTTLING.getErrorMessage().equalsIgnoreCase(
+        splitedServerErrorMessage)) {
+      return OTHER_SERVER_THROTTLING_ABBREVIATION;
     }
     return HTTP_UNAVAILABLE + "";
   }
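The net effect of this file is a message-to-abbreviation mapping for 503 responses. A hedged standalone sketch of that mapping is below; the abbreviation values and messages are the ones defined in RetryReasonConstants and AzureServiceErrorCode in this commit, while the class and method names here are illustrative and the real code compares against the enum's getErrorMessage() rather than string literals.

// Sketch: map a 503 storage error message to its retry-reason abbreviation.
public final class ThrottlingAbbreviationSketch {

  static String abbreviationFor503(String serverErrorMessage) {
    if ("Ingress is over the account limit.".equalsIgnoreCase(serverErrorMessage)) {
      return "ING";   // INGRESS_LIMIT_BREACH_ABBREVIATION
    }
    if ("Egress is over the account limit.".equalsIgnoreCase(serverErrorMessage)) {
      return "EGR";   // EGRESS_LIMIT_BREACH_ABBREVIATION
    }
    if ("Operations per second is over the account limit.".equalsIgnoreCase(serverErrorMessage)) {
      return "OPR";   // TPS_LIMIT_BREACH_ABBREVIATION
    }
    if ("The server is currently unable to receive requests. Please retry your request."
        .equalsIgnoreCase(serverErrorMessage)) {
      return "OTH";   // OTHER_SERVER_THROTTLING_ABBREVIATION
    }
    return "503";     // fall back to the bare status code
  }
}

With OPERATION_BREACH_MESSAGE removed from RetryReasonConstants, the throttling messages now live only on the AzureServiceErrorCode enum, which becomes the single source of truth for these strings.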

hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/services/ITestAbfsRestOperation.java

Lines changed: 6 additions & 0 deletions
@@ -61,6 +61,7 @@
 import static org.apache.hadoop.fs.azurebfs.constants.HttpQueryParams.QUERY_PARAM_POSITION;
 import static org.apache.hadoop.fs.azurebfs.constants.TestConfigurationKeys.FS_AZURE_ABFS_ACCOUNT_NAME;
 import static org.apache.hadoop.fs.azurebfs.constants.TestConfigurationKeys.TEST_CONFIGURATION_FILE_NAME;
+import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.EGRESS_OVER_ACCOUNT_LIMIT;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.never;
@@ -233,6 +234,11 @@ private AbfsRestOperation getRestOperation() throws Exception {
     // mocked the response code and the response message to check different
     // behaviour based on response code.
     Mockito.doReturn(responseCode).when(abfsHttpOperation).getConnResponseCode();
+    if (responseCode == HTTP_UNAVAILABLE) {
+      Mockito.doReturn(EGRESS_OVER_ACCOUNT_LIMIT.getErrorMessage())
+          .when(abfsHttpOperation)
+          .getStorageErrorMessage();
+    }
     Mockito.doReturn(responseMessage)
         .when(abfsHttpOperation)
         .getConnResponseMessage();
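The stubbing here is plain Mockito: when the simulated response code is 503, the mocked operation also returns one of the account-limit messages so the new classification logic sees an egress-limit breach rather than a generic 503. A tiny self-contained example of the same pattern, using a hypothetical interface rather than the ABFS test fixture:

import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;

public class StorageErrorStubExample {

  // Hypothetical stand-in for the mocked HTTP operation.
  interface StorageResponse {
    int getStatusCode();
    String getStorageErrorMessage();
  }

  public static void main(String[] args) {
    StorageResponse response = mock(StorageResponse.class);
    doReturn(503).when(response).getStatusCode();
    doReturn("Egress is over the account limit.")
        .when(response)
        .getStorageErrorMessage();

    // The stubbed message lets throttling-classification code treat this 503
    // as an account-limit breach instead of an unclassified server error.
    System.out.println(response.getStatusCode() + " -> " + response.getStorageErrorMessage());
  }
}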
