diff --git a/core/build.gradle b/core/build.gradle index 9889f39c242..50b9507cb50 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -93,6 +93,7 @@ dependencies { testCompile group: 'junit', name: 'junit', version: '4.12' testCompile group: 'com.google.code.gson', name: 'gson', version: '2.8.2' testCompile group: 'com.github.stefanbirkner', name: 'system-rules', version: '1.19.0' + testCompile group: 'io.github.hakky54', name: 'logcaptor', version: '2.3.1' forTheDLL 'com.microsoft.azure:applicationinsights-core:2.6.2' } diff --git a/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionFileSystemOutput.java b/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionFileSystemOutput.java index 98def3c36aa..fc2c4814b65 100644 --- a/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionFileSystemOutput.java +++ b/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionFileSystemOutput.java @@ -45,6 +45,7 @@ import java.util.concurrent.atomic.AtomicLong; import com.microsoft.applicationinsights.internal.channel.TransmissionOutputSync; +import com.microsoft.applicationinsights.internal.util.ExceptionStats; import com.microsoft.applicationinsights.internal.util.LimitsEnforcer; import com.microsoft.applicationinsights.internal.util.LocalFileSystemUtils; import org.apache.commons.io.FileUtils; @@ -84,7 +85,9 @@ public final class TransmissionFileSystemOutput implements TransmissionOutputSyn private final static int MAX_CAPACITY_MEGABYTES = 1000; private final static int MIN_CAPACITY_MEGABYTES = 1; private static final String MAX_TRANSMISSION_STORAGE_CAPACITY_NAME = "Channel.MaxTransmissionStorageCapacityInMB"; - + private static final ExceptionStats diskExceptionStats = new ExceptionStats( + TransmissionFileSystemOutput.class, + "Unable to store telemetry to disk (telemetry will be discarded):"); /// The folder in which we save transmission files private File folder; @@ -140,10 +143,7 @@ public boolean sendSync(Transmission transmission) { long currentSizeInBytes = size.get(); if (currentSizeInBytes >= capacityInBytes) { - logger.error("Persistent storage max capacity has been reached; " - + "currently at {} bytes. Telemetry will be lost, " - + "please consider increasing the value of MaxTransmissionStorageFilesCapacityInMB property in the configuration file.", - currentSizeInBytes); + diskExceptionStats.recordFailure("local storage capacity (" + capacityInBytes / (1024 * 1024) + "MB) has been exceeded"); return false; } @@ -161,6 +161,7 @@ public boolean sendSync(Transmission transmission) { } logger.debug("Data persisted to file. To be sent when the network is available."); + diskExceptionStats.recordSuccess(); return true; } @@ -278,7 +279,7 @@ private boolean renameToPermanentName(File tempTransmissionFile) { size.addAndGet(fileLength); return true; } catch (Exception e) { - logger.error("Rename To Permanent Name failed, exception: {}", e.toString()); + diskExceptionStats.recordFailure("unable to rename file to permanent name: " + e, e); } return false; @@ -305,7 +306,7 @@ private boolean saveTransmission(File transmissionFile, Transmission transmissio output.writeObject(transmission); return true; } catch (IOException e) { - logger.error("Failed to save transmission, exception: {}", e.toString()); + diskExceptionStats.recordFailure("unable to write to file: " + e, e); } return false; @@ -317,7 +318,7 @@ private Optional createTemporaryFile() { String prefix = TRANSMISSION_FILE_PREFIX + "-" + System.currentTimeMillis() + "-"; file = File.createTempFile(prefix, null, folder); } catch (IOException e) { - logger.error("Failed to create temporary file, exception: {}", e.toString()); + diskExceptionStats.recordFailure("unable to create temporary file: " + e, e); } return Optional.fromNullable(file); diff --git a/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionNetworkOutput.java b/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionNetworkOutput.java index dd80d1f1d80..dd95ce9b73e 100644 --- a/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionNetworkOutput.java +++ b/core/src/main/java/com/microsoft/applicationinsights/internal/channel/common/TransmissionNetworkOutput.java @@ -21,12 +21,20 @@ package com.microsoft.applicationinsights.internal.channel.common; +import java.io.IOException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.UnknownHostException; +import java.util.concurrent.atomic.AtomicBoolean; +import javax.annotation.Nullable; + import com.google.common.base.Preconditions; import com.microsoft.applicationinsights.TelemetryConfiguration; import com.microsoft.applicationinsights.customExceptions.FriendlyException; import com.microsoft.applicationinsights.internal.channel.TransmissionDispatcher; import com.microsoft.applicationinsights.internal.channel.TransmissionHandlerArgs; import com.microsoft.applicationinsights.internal.channel.TransmissionOutputSync; +import com.microsoft.applicationinsights.internal.util.ExceptionStats; import org.apache.commons.lang3.StringUtils; import org.apache.http.Header; import org.apache.http.HttpEntity; @@ -39,12 +47,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; -import java.io.IOException; -import java.net.SocketException; -import java.net.UnknownHostException; -import java.util.concurrent.atomic.AtomicBoolean; - /** * The class is responsible for the actual sending of * {@link com.microsoft.applicationinsights.internal.channel.common.Transmission} @@ -56,7 +58,10 @@ public final class TransmissionNetworkOutput implements TransmissionOutputSync { private static final Logger logger = LoggerFactory.getLogger(TransmissionNetworkOutput.class); - private static volatile AtomicBoolean friendlyExceptionThrown = new AtomicBoolean(); + private static final AtomicBoolean friendlyExceptionThrown = new AtomicBoolean(); + private static final ExceptionStats networkExceptionStats = new ExceptionStats( + TransmissionNetworkOutput.class, + "Unable to send telemetry to the ingestion service (telemetry will be stored to disk):"); private static final String CONTENT_TYPE_HEADER = "Content-Type"; private static final String CONTENT_ENCODING_HEADER = "Content-Encoding"; @@ -173,35 +178,34 @@ public boolean sendSync(Transmission transmission) { // If we've completed then clear the back off flags as the channel does not need // to be throttled transmissionPolicyManager.clearBackoff(); + // Increment Success Counter + networkExceptionStats.recordSuccess(); } return true; - } catch (ConnectionPoolTimeoutException e) { - ex = e; - logger.error("Failed to send, connection pool timeout exception", e); + networkExceptionStats.recordFailure("connection pool timeout exception: " + e, e); } catch (SocketException e) { - ex = e; - logger.error("Failed to send, socket exception", e); + networkExceptionStats.recordFailure("socket exception: " + e, e); + } catch (SocketTimeoutException e) { + networkExceptionStats.recordFailure("socket timeout exception: " + e, e); } catch (UnknownHostException e) { - ex = e; - logger.error("Failed to send, wrong host address or cannot reach address due to network issues", e); - } catch (IOException ioe) { - ex = ioe; - logger.error("Failed to send", ioe); + networkExceptionStats.recordFailure("wrong host address or cannot reach address due to network issues: " + e, e); + } catch (IOException e) { + networkExceptionStats.recordFailure("I/O exception: " + e, e); } catch (FriendlyException e) { ex = e; + // TODO should this be merged into networkExceptionStats? if(!friendlyExceptionThrown.getAndSet(true)) { logger.error(e.getMessage()); } } catch (Exception e) { - ex = e; - logger.error("Failed to send, unexpected exception", e); + networkExceptionStats.recordFailure("unexpected exception: " + e, e); } catch (ThreadDeath td) { throw td; } catch (Throwable t) { ex = t; try { - logger.error("Failed to send, unexpected error", t); + networkExceptionStats.recordFailure("unexpected exception: " + t, t); } catch (ThreadDeath td) { throw td; } catch (Throwable t2) { @@ -214,7 +218,7 @@ public boolean sendSync(Transmission transmission) { httpClient.dispose(response); if (code == HttpStatus.SC_BAD_REQUEST) { - logger.error("Error sending data: {}", reason); + networkExceptionStats.recordFailure("ingestion service returned 400 (" + reason + ")"); } else if (code != HttpStatus.SC_OK) { // Invoke the listeners for handling things like errors // The listeners will handle the back off logic as well as the dispatch diff --git a/core/src/main/java/com/microsoft/applicationinsights/internal/util/ExceptionStats.java b/core/src/main/java/com/microsoft/applicationinsights/internal/util/ExceptionStats.java new file mode 100644 index 00000000000..0c6bf97d4fc --- /dev/null +++ b/core/src/main/java/com/microsoft/applicationinsights/internal/util/ExceptionStats.java @@ -0,0 +1,134 @@ +package com.microsoft.applicationinsights.internal.util; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// exception stats for a given 5-min window +// each instance represents a logical grouping of errors that a user cares about and can understand, +// e.g. sending telemetry to the portal, storing telemetry to disk, ... +public class ExceptionStats { + + private static final ScheduledExecutorService scheduledExecutor = + Executors.newSingleThreadScheduledExecutor(ThreadPoolUtils.createDaemonThreadFactory(ExceptionStats.class, "exception stats logger")); + + private final Logger logger; + private final String introMessage; + + // Period for scheduled executor in secs + private final int intervalSeconds; + + private final AtomicBoolean firstFailure = new AtomicBoolean(); + + // private final String groupingMessage; + + // number of successes and failures in the 5-min window + private long numSuccesses; + private long numFailures; + + // using MutableLong for two purposes + // * so we don't need to get and set into map each time we want to increment + // * avoid autoboxing for values above 128 + private Map warningMessages = new HashMap<>(); + + private final Object lock = new Object(); + + public ExceptionStats(Class source, String introMessage) { + this(source, introMessage, 300); + } + + // Primarily used by test + public ExceptionStats(Class source, String introMessage, int intervalSeconds) { + logger = LoggerFactory.getLogger(source); + this.introMessage = introMessage; + this.intervalSeconds = intervalSeconds; + } + + public void recordSuccess() { + synchronized (lock) { + numSuccesses++; + } + } + + // warningMessage should have low cardinality + public void recordFailure(String warningMessage) { + recordFailure(warningMessage, null); + } + + // warningMessage should have low cardinality + public void recordFailure(String warningMessage, @Nullable Throwable exception) { + if (!firstFailure.getAndSet(true)) { + // log the first time we see an exception as soon as it occurs, along with full stack trace + logger.warn(introMessage + " " + warningMessage + " (future failures will be aggregated and logged once every " + intervalSeconds / 60 + " minutes)", exception); + scheduledExecutor.scheduleAtFixedRate(new ExceptionStatsLogger(), intervalSeconds, intervalSeconds, TimeUnit.SECONDS); + return; + } + + logger.debug(introMessage + " " + warningMessage, exception); + + if (warningMessages.size() > 100) { + // we have a cardinality problem and don't want to consume too much memory or do too much logging + return; + } + synchronized (lock) { + warningMessages.computeIfAbsent(warningMessage, key -> new MutableLong()).increment(); + numFailures++; + } + } + + private static class MutableLong { + private long value; + private void increment() { + value++; + } + } + + public class ExceptionStatsLogger implements Runnable { + + @Override + public void run() { + long numSuccesses; + long numFailures; + Map warningMessages; + // grab quickly and reset under lock (do not perform logging under lock) + synchronized (lock) { + numSuccesses = ExceptionStats.this.numSuccesses; + numFailures = ExceptionStats.this.numFailures; + warningMessages = ExceptionStats.this.warningMessages; + + ExceptionStats.this.numSuccesses = 0; + ExceptionStats.this.numFailures = 0; + ExceptionStats.this.warningMessages = new HashMap<>(); + } + if (numFailures > 0) { + long numMinutes = ExceptionStats.this.intervalSeconds / 60; + long total = numSuccesses + numFailures; + StringBuilder message = new StringBuilder(); + message.append("In the last "); + message.append(numMinutes); + message.append(" minutes, the following operation has failed "); + message.append(numFailures); + message.append(" times (out of "); + message.append(total); + message.append("):\n"); + message.append(introMessage); + for (Map.Entry entry : warningMessages.entrySet()) { + message.append("\n * "); + message.append(entry.getKey()); + message.append(" ("); + message.append(entry.getValue().value); + message.append(" times)"); + } + logger.warn(message.toString()); + } + } + } +} diff --git a/core/src/test/java/com/microsoft/applicationinsights/internal/util/ExceptionStatsLoggerTest.java b/core/src/test/java/com/microsoft/applicationinsights/internal/util/ExceptionStatsLoggerTest.java new file mode 100644 index 00000000000..84995993deb --- /dev/null +++ b/core/src/test/java/com/microsoft/applicationinsights/internal/util/ExceptionStatsLoggerTest.java @@ -0,0 +1,35 @@ +package com.microsoft.applicationinsights.internal.util; + +import nl.altindag.log.LogCaptor; +import org.junit.*; + +import static org.junit.Assert.*; + +public class ExceptionStatsLoggerTest { + private static ExceptionStats networkExceptionStats; + + @BeforeClass + public static void setUp() { + // one-time initialization code + networkExceptionStats = new ExceptionStats(ExceptionStatsLoggerTest.class, "intro:", 1); + } + + @Test + public void testWarnAndExceptionsAreLogged() throws InterruptedException { + LogCaptor logCaptor = LogCaptor.forClass(ExceptionStatsLoggerTest.class); + networkExceptionStats.recordSuccess(); + Exception ex=new IllegalArgumentException(); + networkExceptionStats.recordFailure("Test Message",ex); + networkExceptionStats.recordFailure("Test Message2",ex); + networkExceptionStats.recordFailure("Test Message2",ex); + networkExceptionStats.recordFailure("Test Message3",ex); + //wait for more than 1 second + Thread.sleep(3000); + assertEquals(2,logCaptor.getWarnLogs().size()); + assertTrue(logCaptor.getWarnLogs().get(0).contains("intro: Test Message (future failures will be aggregated and logged once every 0 minutes)")); + assertTrue(logCaptor.getWarnLogs().get(1).contains("In the last 0 minutes, the following operation has failed 3 times (out of 4):\n" + + "intro:\n" + + " * Test Message2 (2 times)\n" + + " * Test Message3 (1 times)")); + } +}