Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error messages for network connectivity and memory issue #1483

Merged
merged 20 commits into from
Feb 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ dependencies {
testCompile group: 'junit', name: 'junit', version: '4.12'
testCompile group: 'com.google.code.gson', name: 'gson', version: '2.8.2'
testCompile group: 'com.github.stefanbirkner', name: 'system-rules', version: '1.19.0'
testCompile group: 'io.github.hakky54', name: 'logcaptor', version: '2.3.1'

forTheDLL 'com.microsoft.azure:applicationinsights-core:2.6.2'
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import java.util.concurrent.atomic.AtomicLong;

import com.microsoft.applicationinsights.internal.channel.TransmissionOutputSync;
import com.microsoft.applicationinsights.internal.util.ExceptionStats;
import com.microsoft.applicationinsights.internal.util.LimitsEnforcer;
import com.microsoft.applicationinsights.internal.util.LocalFileSystemUtils;
import org.apache.commons.io.FileUtils;
Expand Down Expand Up @@ -84,7 +85,9 @@ public final class TransmissionFileSystemOutput implements TransmissionOutputSyn
private final static int MAX_CAPACITY_MEGABYTES = 1000;
private final static int MIN_CAPACITY_MEGABYTES = 1;
private static final String MAX_TRANSMISSION_STORAGE_CAPACITY_NAME = "Channel.MaxTransmissionStorageCapacityInMB";

private static final ExceptionStats diskExceptionStats = new ExceptionStats(
TransmissionFileSystemOutput.class,
"Unable to store telemetry to disk (telemetry will be discarded):");

/// The folder in which we save transmission files
private File folder;
Expand Down Expand Up @@ -140,10 +143,7 @@ public boolean sendSync(Transmission transmission) {

long currentSizeInBytes = size.get();
if (currentSizeInBytes >= capacityInBytes) {
logger.error("Persistent storage max capacity has been reached; "
+ "currently at {} bytes. Telemetry will be lost, "
+ "please consider increasing the value of MaxTransmissionStorageFilesCapacityInMB property in the configuration file.",
currentSizeInBytes);
diskExceptionStats.recordFailure("local storage capacity (" + capacityInBytes / (1024 * 1024) + "MB) has been exceeded");
return false;
}

Expand All @@ -161,6 +161,7 @@ public boolean sendSync(Transmission transmission) {
}

logger.debug("Data persisted to file. To be sent when the network is available.");
diskExceptionStats.recordSuccess();
return true;
}

Expand Down Expand Up @@ -278,7 +279,7 @@ private boolean renameToPermanentName(File tempTransmissionFile) {
size.addAndGet(fileLength);
return true;
} catch (Exception e) {
logger.error("Rename To Permanent Name failed, exception: {}", e.toString());
diskExceptionStats.recordFailure("unable to rename file to permanent name: " + e, e);
}

return false;
Expand All @@ -305,7 +306,7 @@ private boolean saveTransmission(File transmissionFile, Transmission transmissio
output.writeObject(transmission);
return true;
} catch (IOException e) {
logger.error("Failed to save transmission, exception: {}", e.toString());
diskExceptionStats.recordFailure("unable to write to file: " + e, e);
}

return false;
Expand All @@ -317,7 +318,7 @@ private Optional<File> createTemporaryFile() {
String prefix = TRANSMISSION_FILE_PREFIX + "-" + System.currentTimeMillis() + "-";
file = File.createTempFile(prefix, null, folder);
} catch (IOException e) {
logger.error("Failed to create temporary file, exception: {}", e.toString());
diskExceptionStats.recordFailure("unable to create temporary file: " + e, e);
}

return Optional.fromNullable(file);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,20 @@

package com.microsoft.applicationinsights.internal.channel.common;

import java.io.IOException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.annotation.Nullable;

import com.google.common.base.Preconditions;
import com.microsoft.applicationinsights.TelemetryConfiguration;
import com.microsoft.applicationinsights.customExceptions.FriendlyException;
import com.microsoft.applicationinsights.internal.channel.TransmissionDispatcher;
import com.microsoft.applicationinsights.internal.channel.TransmissionHandlerArgs;
import com.microsoft.applicationinsights.internal.channel.TransmissionOutputSync;
import com.microsoft.applicationinsights.internal.util.ExceptionStats;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
Expand All @@ -39,12 +47,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.concurrent.atomic.AtomicBoolean;

/**
* The class is responsible for the actual sending of
* {@link com.microsoft.applicationinsights.internal.channel.common.Transmission}
Expand All @@ -56,7 +58,10 @@
public final class TransmissionNetworkOutput implements TransmissionOutputSync {

private static final Logger logger = LoggerFactory.getLogger(TransmissionNetworkOutput.class);
private static volatile AtomicBoolean friendlyExceptionThrown = new AtomicBoolean();
private static final AtomicBoolean friendlyExceptionThrown = new AtomicBoolean();
private static final ExceptionStats networkExceptionStats = new ExceptionStats(
TransmissionNetworkOutput.class,
"Unable to send telemetry to the ingestion service (telemetry will be stored to disk):");

private static final String CONTENT_TYPE_HEADER = "Content-Type";
private static final String CONTENT_ENCODING_HEADER = "Content-Encoding";
Expand Down Expand Up @@ -173,35 +178,34 @@ public boolean sendSync(Transmission transmission) {
// If we've completed then clear the back off flags as the channel does not need
// to be throttled
transmissionPolicyManager.clearBackoff();
// Increment Success Counter
networkExceptionStats.recordSuccess();
}
return true;

} catch (ConnectionPoolTimeoutException e) {
ex = e;
trask marked this conversation as resolved.
Show resolved Hide resolved
logger.error("Failed to send, connection pool timeout exception", e);
networkExceptionStats.recordFailure("connection pool timeout exception: " + e, e);
} catch (SocketException e) {
ex = e;
logger.error("Failed to send, socket exception", e);
networkExceptionStats.recordFailure("socket exception: " + e, e);
} catch (SocketTimeoutException e) {
networkExceptionStats.recordFailure("socket timeout exception: " + e, e);
} catch (UnknownHostException e) {
ex = e;
logger.error("Failed to send, wrong host address or cannot reach address due to network issues", e);
} catch (IOException ioe) {
ex = ioe;
logger.error("Failed to send", ioe);
networkExceptionStats.recordFailure("wrong host address or cannot reach address due to network issues: " + e, e);
} catch (IOException e) {
networkExceptionStats.recordFailure("I/O exception: " + e, e);
} catch (FriendlyException e) {
ex = e;
// TODO should this be merged into networkExceptionStats?
if(!friendlyExceptionThrown.getAndSet(true)) {
logger.error(e.getMessage());
}
} catch (Exception e) {
ex = e;
logger.error("Failed to send, unexpected exception", e);
networkExceptionStats.recordFailure("unexpected exception: " + e, e);
} catch (ThreadDeath td) {
throw td;
} catch (Throwable t) {
ex = t;
try {
logger.error("Failed to send, unexpected error", t);
networkExceptionStats.recordFailure("unexpected exception: " + t, t);
} catch (ThreadDeath td) {
throw td;
} catch (Throwable t2) {
Expand All @@ -214,7 +218,7 @@ public boolean sendSync(Transmission transmission) {
httpClient.dispose(response);

if (code == HttpStatus.SC_BAD_REQUEST) {
logger.error("Error sending data: {}", reason);
networkExceptionStats.recordFailure("ingestion service returned 400 (" + reason + ")");
} else if (code != HttpStatus.SC_OK) {
// Invoke the listeners for handling things like errors
// The listeners will handle the back off logic as well as the dispatch
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package com.microsoft.applicationinsights.internal.util;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.annotation.Nullable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// exception stats for a given 5-min window
// each instance represents a logical grouping of errors that a user cares about and can understand,
// e.g. sending telemetry to the portal, storing telemetry to disk, ...
public class ExceptionStats {

private static final ScheduledExecutorService scheduledExecutor =
Executors.newSingleThreadScheduledExecutor(ThreadPoolUtils.createDaemonThreadFactory(ExceptionStats.class, "exception stats logger"));

private final Logger logger;
private final String introMessage;

// Period for scheduled executor in secs
private final int intervalSeconds;

private final AtomicBoolean firstFailure = new AtomicBoolean();

// private final String groupingMessage;

// number of successes and failures in the 5-min window
private long numSuccesses;
private long numFailures;

// using MutableLong for two purposes
// * so we don't need to get and set into map each time we want to increment
// * avoid autoboxing for values above 128
private Map<String, MutableLong> warningMessages = new HashMap<>();

private final Object lock = new Object();

public ExceptionStats(Class<?> source, String introMessage) {
this(source, introMessage, 300);
}

// Primarily used by test
public ExceptionStats(Class<?> source, String introMessage, int intervalSeconds) {
logger = LoggerFactory.getLogger(source);
this.introMessage = introMessage;
this.intervalSeconds = intervalSeconds;
}

public void recordSuccess() {
synchronized (lock) {
numSuccesses++;
}
}

// warningMessage should have low cardinality
public void recordFailure(String warningMessage) {
recordFailure(warningMessage, null);
}

// warningMessage should have low cardinality
public void recordFailure(String warningMessage, @Nullable Throwable exception) {
if (!firstFailure.getAndSet(true)) {
// log the first time we see an exception as soon as it occurs, along with full stack trace
logger.warn(introMessage + " " + warningMessage + " (future failures will be aggregated and logged once every " + intervalSeconds / 60 + " minutes)", exception);
scheduledExecutor.scheduleAtFixedRate(new ExceptionStatsLogger(), intervalSeconds, intervalSeconds, TimeUnit.SECONDS);
return;
}

logger.debug(introMessage + " " + warningMessage, exception);

if (warningMessages.size() > 100) {
// we have a cardinality problem and don't want to consume too much memory or do too much logging
return;
}
synchronized (lock) {
warningMessages.computeIfAbsent(warningMessage, key -> new MutableLong()).increment();
numFailures++;
}
}

private static class MutableLong {
private long value;
private void increment() {
value++;
}
}

public class ExceptionStatsLogger implements Runnable {

@Override
public void run() {
long numSuccesses;
long numFailures;
Map<String, MutableLong> warningMessages;
// grab quickly and reset under lock (do not perform logging under lock)
synchronized (lock) {
numSuccesses = ExceptionStats.this.numSuccesses;
numFailures = ExceptionStats.this.numFailures;
warningMessages = ExceptionStats.this.warningMessages;

ExceptionStats.this.numSuccesses = 0;
ExceptionStats.this.numFailures = 0;
ExceptionStats.this.warningMessages = new HashMap<>();
}
if (numFailures > 0) {
long numMinutes = ExceptionStats.this.intervalSeconds / 60;
long total = numSuccesses + numFailures;
StringBuilder message = new StringBuilder();
message.append("In the last ");
message.append(numMinutes);
message.append(" minutes, the following operation has failed ");
message.append(numFailures);
message.append(" times (out of ");
message.append(total);
message.append("):\n");
message.append(introMessage);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding introMessage again is adding too much clutter I think.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's good context, but I shortened the intro significantly to make the messages less clutterful

for (Map.Entry<String, MutableLong> entry : warningMessages.entrySet()) {
message.append("\n * ");
message.append(entry.getKey());
message.append(" (");
message.append(entry.getValue().value);
message.append(" times)");
}
logger.warn(message.toString());
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.microsoft.applicationinsights.internal.util;

import nl.altindag.log.LogCaptor;
import org.junit.*;

import static org.junit.Assert.*;

public class ExceptionStatsLoggerTest {
private static ExceptionStats networkExceptionStats;

@BeforeClass
public static void setUp() {
// one-time initialization code
networkExceptionStats = new ExceptionStats(ExceptionStatsLoggerTest.class, "intro:", 1);
}

@Test
public void testWarnAndExceptionsAreLogged() throws InterruptedException {
LogCaptor logCaptor = LogCaptor.forClass(ExceptionStatsLoggerTest.class);
networkExceptionStats.recordSuccess();
Exception ex=new IllegalArgumentException();
networkExceptionStats.recordFailure("Test Message",ex);
networkExceptionStats.recordFailure("Test Message2",ex);
networkExceptionStats.recordFailure("Test Message2",ex);
networkExceptionStats.recordFailure("Test Message3",ex);
//wait for more than 1 second
Thread.sleep(3000);
assertEquals(2,logCaptor.getWarnLogs().size());
assertTrue(logCaptor.getWarnLogs().get(0).contains("intro: Test Message (future failures will be aggregated and logged once every 0 minutes)"));
assertTrue(logCaptor.getWarnLogs().get(1).contains("In the last 0 minutes, the following operation has failed 3 times (out of 4):\n" +
"intro:\n" +
" * Test Message2 (2 times)\n" +
" * Test Message3 (1 times)"));
}
}