diff --git a/alert-policies/apache-hadoop/NameNodeDeadDataNodes.yml b/alert-policies/apache-hadoop/NameNodeDeadDataNodes.yml new file mode 100644 index 0000000000..4663c3eaf5 --- /dev/null +++ b/alert-policies/apache-hadoop/NameNodeDeadDataNodes.yml @@ -0,0 +1,32 @@ +# Name of the alert +name: Dead DataNodes + +# Description and details +description: |+ + This alert is triggered if the dead DataNodes exceeds 1 for 5 minutes. +# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT latest(NumDeadDataNodes) AS 'Dead Data Nodes' FROM HadoopNameNodeSample" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/apache-hadoop/NameNodeMissingBlocks.yml b/alert-policies/apache-hadoop/NameNodeMissingBlocks.yml new file mode 100644 index 0000000000..4507bf4695 --- /dev/null +++ b/alert-policies/apache-hadoop/NameNodeMissingBlocks.yml @@ -0,0 +1,41 @@ +# Name of the alert +name: Missing Blocks in HDFS + +# Description and details +description: |+ + This alert is triggered if the missing blocks exceeds 3 for 5 minutes. +# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT latest(NumberOfMissingBlocks) AS 'Missing Blocks' FROM HadoopNameNodeSample" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 3 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/apache-hadoop/NameNodeVolumeFailures.yml b/alert-policies/apache-hadoop/NameNodeVolumeFailures.yml new file mode 100644 index 0000000000..05d0248609 --- /dev/null +++ b/alert-policies/apache-hadoop/NameNodeVolumeFailures.yml @@ -0,0 +1,41 @@ +# Name of the alert +name: NameNode Volume Failures + +# Description and details +description: |+ + This alert is triggered if the volume failures exceeds 2 for 5 minutes. +# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT latest(VolumeFailuresTotal) AS 'Volume Failures' FROM HadoopNameNodeSample" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 2 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/apache-hadoop/UsedDiskPercentage.yml b/alert-policies/apache-hadoop/UsedDiskPercentage.yml new file mode 100644 index 0000000000..a784a1e625 --- /dev/null +++ b/alert-policies/apache-hadoop/UsedDiskPercentage.yml @@ -0,0 +1,41 @@ +# Name of the alert +name: Used Disk Percent + +# Description and details +description: |+ + This alert is triggered when used disk space exceeds 90% for at least 5 minutes. +# Type of alert +type: STATIC + +# NRQL query +nrql: + + query: "SELECT latest(PercentUsed) FROM HadoopNameNodeSampleMetrics" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + - priority: WARNING + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 85 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/dashboards/apache-hadoop/apache-hadoop-01.png b/dashboards/apache-hadoop/apache-hadoop-01.png new file mode 100644 index 0000000000..6325f356d7 Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-01.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-02.png b/dashboards/apache-hadoop/apache-hadoop-02.png new file mode 100644 index 0000000000..754a5789e4 Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-02.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-03.png b/dashboards/apache-hadoop/apache-hadoop-03.png new file mode 100644 index 0000000000..fce2167c86 Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-03.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-04.png b/dashboards/apache-hadoop/apache-hadoop-04.png new file mode 100644 index 0000000000..c312568ba4 Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-04.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-05.png b/dashboards/apache-hadoop/apache-hadoop-05.png new file mode 100644 index 0000000000..16c74ea159 Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-05.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-06.png b/dashboards/apache-hadoop/apache-hadoop-06.png new file mode 100644 index 0000000000..00c5e2d54e Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-06.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop-07.png b/dashboards/apache-hadoop/apache-hadoop-07.png new file mode 100644 index 0000000000..53b82b289b Binary files /dev/null and b/dashboards/apache-hadoop/apache-hadoop-07.png differ diff --git a/dashboards/apache-hadoop/apache-hadoop.json b/dashboards/apache-hadoop/apache-hadoop.json new file mode 100644 index 0000000000..c791b17ec8 --- /dev/null +++ b/dashboards/apache-hadoop/apache-hadoop.json @@ -0,0 +1,1484 @@ +{ + "name": "Apache Hadoop", + "description": null, + "pages": [ + { + "name": "Overview", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "![logo](https://upload.wikimedia.org/wikipedia/commons/3/38/Hadoop_logo_new.svg)" + } + }, + { + "title": "NameNode Dead DataNodes", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumDeadDataNodes) AS 'dead nodes' FROM HadoopNameNodeSampleMetrics " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": 1 + } + ] + } + }, + { + "title": "NameNode Volume Failures", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(VolumeFailuresTotal) AS 'volume failures' FROM HadoopNameNodeSampleMetrics" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": 2 + }, + { + "alertSeverity": "WARNING", + "value": 1 + } + ] + } + }, + { + "title": "NameNode Missing Blocks", + "layout": { + "column": 7, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumberOfMissingBlocks) AS 'missing blocks' FROM HadoopNameNodeSampleMetrics " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": 3 + }, + { + "alertSeverity": "WARNING", + "value": 0.9 + } + ] + } + }, + { + "title": "Used Disk Percent (%)", + "layout": { + "column": 9, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(PercentUsed) AS '(%) disk usage' FROM HadoopNameNodeSampleMetrics " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": 90 + }, + { + "alertSeverity": "CRITICAL", + "value": 95 + } + ] + } + }, + { + "title": "Stale Data Nodes", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumStaleDataNodes) AS 'stale data nodes' FROM HadoopNameNodeSampleMetrics WHERE NumStaleDataNodes != 0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Apache Hadoop Version - Please select the entity name", + "layout": { + "column": 1, + "row": 2, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(SoftwareVersion) AS '' FROM HadoopNameNodeSampleMetrics WHERE entityName = {{entity_name}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Hadoop Node Manager Status", + "layout": { + "column": 3, + "row": 2, + "width": 5, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "colors": { + "seriesOverrides": [ + { + "color": "#31ce1c", + "seriesName": "Active nodes" + }, + { + "color": "#dd1313", + "seriesName": "Lost nodes" + } + ] + }, + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumActiveNMs) AS 'Active nodes', latest(NumLostNMs) AS 'Lost nodes' FROM HadoopResourceManagerMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Cache Used ", + "layout": { + "column": 8, + "row": 2, + "width": 5, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(CacheUsed) AS 'Cache used' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "", + "layout": { + "column": 1, + "row": 4, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "\n\n**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com/marketplace?account=3627899&state=70385b17-af48-cb31-24f8-fa1e82a0ea5f).\n\nUnable to find data in your dashboard? -[Troubleshoot here](\nhttps://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/flex-integration-tool-build-your-own-integration/?q=prometheus)\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=ApacheHadoop) here and let us know how we can improve it for you.\n" + } + }, + { + "title": "", + "layout": { + "column": 1, + "row": 6, + "width": 12, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# Hadoop health\nFind valuable insights into heap memory, non-heap memory, blocks, files, and threads with New Relic Hadoop." + } + }, + { + "title": "Heap Memory Statistics", + "layout": { + "column": 1, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(MemHeapUsedM) AS 'Memory heap used MB', latest(MemHeapMaxM) AS 'Memory heap max MB', latest(MemHeapCommittedM) AS 'Memory heap committed MB' FROM HadoopNameNodeSampleMetrics TIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Non Heap Memory Statistics", + "layout": { + "column": 5, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(MemNonHeapUsedM) AS 'Memory non heap used MB', latest(MemNonHeapMaxM) AS 'Memory non heap max MB', latest(MemNonHeapCommittedM) AS 'Memory non heap committed MB' FROM HadoopNameNodeSampleMetrics TIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Files Statistics", + "layout": { + "column": 9, + "row": 7, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(FilesTotal), latest(FilesCreated), latest(FilesDeleted), latest(FilesRenamed) FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Blocks Statistics", + "layout": { + "column": 1, + "row": 10, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(BlocksTotal) AS 'Total blocks', latest(BlockCapacity) AS 'Block capacity', latest(CorruptBlocks) AS 'Corrupt blocks', latest(ExcessBlocks) AS 'Excess blocks', latest(MissingBlocks) AS 'Missing blocks', latest(UnderReplicatedBlocks) AS 'Replicated blocks' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Thread Statistics", + "layout": { + "column": 5, + "row": 10, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(Threads), latest(ThreadsBlocked) AS 'Threads blocked', latest(ThreadsRunnable) AS 'Threads runnable', latest(ThreadsNew) AS 'Threads new', latest(ThreadsTerminated) AS 'Threads terminated', latest(ThreadsWaiting) AS 'Threads waiting' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Log Statistics", + "layout": { + "column": 9, + "row": 10, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(LogError) AS 'Log error', latest(LogFatal) AS 'Log fatal', latest(LogInfo) AS 'Log info', latest(LogWarn) AS 'Log warn' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "HDFS", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# HDFS - (Hadoop Distributed File System)\nIt is crucial to monitor essential HDFS metrics as it enables you to keep track of DFS capacity, available space, block status, and optimize data storage.\n" + } + }, + { + "title": "Hadoop Nodes Status", + "layout": { + "column": 1, + "row": 2, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumLiveDataNodes) AS 'Live nodes', latest(NumDeadDataNodes) AS 'Dead nodes' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "HDFS Disk Spaces", + "layout": { + "column": 7, + "row": 2, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(CapacityTotalGB) as 'Capacity total GB' , latest(CapacityUsedGB) as 'Capacity used GB', latest(CapacityRemainingGB) as 'Capacity remaining GB', latest(NonDfsUsedSpace)/1e+9\n as 'Non Dfs used space GB' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + } + ] + }, + { + "name": "Yarn Cluster", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# Yarn Cluster\nMonitoring the YARN cluster allows you to closely track the allocation and consumption of resources, ensuring that workloads are distributed effectively and efficiently across the cluster.\n" + } + }, + { + "title": "Active Users", + "layout": { + "column": 1, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(ActiveUsers) AS 'active users' FROM HadoopResourceManagerMetrics WHERE ActiveUsers !=0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Allocated Containers", + "layout": { + "column": 3, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AllocatedContainers) AS 'allocated containers' FROM HadoopResourceManagerMetrics WHERE AllocatedContainers !=0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Pending Containers", + "layout": { + "column": 5, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(PendingContainers) AS 'pending containers' FROM HadoopResourceManagerMetrics WHERE PendingContainers !=0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -0.9 + } + ] + } + }, + { + "title": "Reserved Containers", + "layout": { + "column": 7, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(ReservedContainers) AS 'reserved containers' FROM HadoopResourceManagerMetrics" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Rebooted", + "layout": { + "column": 9, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(NumRebootedNMs) AS 'rebooted nodes' FROM HadoopResourceManagerMetrics" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Available V Cores", + "layout": { + "column": 11, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AvailableVCores) AS 'available VCores' FROM HadoopResourceManagerMetrics WHERE AvailableVCores != 0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Allocated Memory", + "layout": { + "column": 1, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AllocatedMB) AS 'MB' FROM HadoopResourceManagerMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "zero": true + } + } + }, + { + "title": "Virtual Cores", + "layout": { + "column": 7, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(PendingVCores), latest(ReservedVCores), latest(AllocatedVCores), latest(AvailableVCores) FROM HadoopResourceManagerMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Memory Usage", + "layout": { + "column": 1, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(MemoryUsed) AS 'MB' FROM HadoopNameNodeSampleMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Garbage Collection Time", + "layout": { + "column": 7, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(GcTimeMillis) AS 'Milliseconds'FROM HadoopResourceManagerMetrics TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "AM", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "# Application Master\nUse the Application Master (AM) monitor to track the status of your applications. This monitor provides metrics for killed, failed, pending, and running data, which can help you effectively manage and optimize your application performance.\n" + } + }, + { + "title": "Apps Running", + "layout": { + "column": 1, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsRunning) AS 'apps running' FROM HadoopResourceManagerMetrics WHERE AppsRunning !=0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Apps Completed", + "layout": { + "column": 3, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsCompleted) AS 'apps completed' FROM HadoopResourceManagerMetrics WHERE AppsCompleted != 0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Apps Submitted", + "layout": { + "column": 5, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsSubmitted) AS 'apps submitted' FROM HadoopResourceManagerMetrics WHERE AppsSubmitted != 0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "value": 0 + } + ] + } + }, + { + "title": "Apps Pending", + "layout": { + "column": 7, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsPending) AS 'apps pending' FROM HadoopResourceManagerMetrics WHERE AppsPending !=0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "WARNING", + "value": -0.9 + } + ] + } + }, + { + "title": "Apps Killed", + "layout": { + "column": 9, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsKilled) AS 'apps killed' FROM HadoopResourceManagerMetrics" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": 1 + }, + { + "value": 0 + } + ] + } + }, + { + "title": "Apps Failed", + "layout": { + "column": 11, + "row": 2, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AppsFailed) AS 'apps failed' FROM HadoopResourceManagerMetrics WHERE AppsFailed != 0" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": [ + { + "alertSeverity": "CRITICAL", + "value": -0.9 + } + ] + } + }, + { + "title": "AM Resource Statistics", + "layout": { + "column": 1, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AMResourceLimitMB) AS 'AM Resource limit MB', latest(AMResourceLimitVCores) AS 'AM Resource limit V cores', latest(UsedAMResourceMB) AS 'Used AM resource MB', latest(UsedAMResourceVCores) AS 'Used AM resource V cores' FROM HadoopResourceManagerMetrics TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average Delay Time", + "layout": { + "column": 7, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(AMLaunchDelayAvgTime) AS 'Milliseconds' FROM HadoopResourceManagerMetrics TIMESERIES " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "Infrastructure", + "description": null, + "widgets": [ + { + "title": "CPU Usage (%)", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(host.cpuPercent) AS 'CPU Used %' FROM Metric TIMESERIES auto" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": true + } + } + }, + { + "title": "Memory Usage (%)", + "layout": { + "column": 5, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(host.memoryUsedPercent) AS 'Memory used %' FROM Metric TIMESERIES auto" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Storage Usage (%)", + "layout": { + "column": 9, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(host.disk.usedPercent) as 'Storage used %' FROM Metric TIMESERIES auto" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Network Traffic", + "layout": { + "column": 1, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(host.net.transmitBytesPerSecond) AS 'Transmit bytes per second', average(host.net.receiveBytesPerSecond) AS 'Receive bytes per second' FROM Metric TIMESERIES auto" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Load Average", + "layout": { + "column": 5, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(host.loadAverageOneMinute) as '1 minute', average(host.loadAverageFiveMinute) AS '5 minutes', average(host.loadAverageFifteenMinute) AS '15 minutes' FROM Metric TIMESERIES auto" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Processes Running", + "layout": { + "column": 9, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(host.process.cpuPercent) as 'CPU %', latest(host.process.threadCount) as 'Threads' FROM Metric FACET processId, processDisplayName ORDER BY cpuPercent asc LIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "Logs", + "description": null, + "widgets": [ + { + "title": "", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "## Choose variable\nSelect your appropriate \"logtype\" variable(s) to display the related logs." + } + }, + { + "title": "Apache Hadoop Logs", + "layout": { + "column": 1, + "row": 2, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT * FROM Log WHERE logtype = {{log_type}}" + } + ] + } + } + ] + } + ], + "variables": [ + { + "name": "log_type", + "items": null, + "defaultValues": [ + { + "value": { + "string": "hadoop_namenode_logs" + } + }, + { + "value": { + "string": "hadoop_resourcemanager_logs" + } + }, + { + "value": { + "string": "hadoop_secondarynamenode_logs" + } + } + ], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(logtype) FROM Log since 1 month ago" + }, + "title": "Select your logtype", + "type": "NRQL", + "isMultiSelection": true, + "replacementStrategy": "STRING" + }, + { + "name": "entity_name", + "items": null, + "defaultValues": [], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(entityName) FROM HadoopNameNodeSampleMetrics SINCE 3 MONTHS AGO " + }, + "title": "Entity Name", + "type": "NRQL", + "isMultiSelection": false, + "replacementStrategy": "STRING" + } + ] +} diff --git a/data-sources/apache-hadoop/config.yml b/data-sources/apache-hadoop/config.yml new file mode 100644 index 0000000000..95b13224d3 --- /dev/null +++ b/data-sources/apache-hadoop/config.yml @@ -0,0 +1,12 @@ +id: apache-hadoop +displayName: Apache Hadoop +description: | + Optimize your Apache Hadoop performance using New Relic Apache Hadoop monitoring. +icon: logo.png +install: + primary: + link: + url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/apache-hadoop-monitoring-integration/ +keywords: + - apache-hadoop + - apache \ No newline at end of file diff --git a/data-sources/apache-hadoop/logo.png b/data-sources/apache-hadoop/logo.png new file mode 100644 index 0000000000..9e260fbf53 Binary files /dev/null and b/data-sources/apache-hadoop/logo.png differ diff --git a/quickstarts/apache-hadoop/config.yml b/quickstarts/apache-hadoop/config.yml new file mode 100644 index 0000000000..29f17004dd --- /dev/null +++ b/quickstarts/apache-hadoop/config.yml @@ -0,0 +1,43 @@ + +slug: apache-hadoop +description: | + ## Comprehensive monitoring quickstart for Apache Hadoop + Apache Hadoop metrics are statistical data that Hadoop daemons offer and are used for performance adjustment, debugging, and monitoring. By integrating Apache Hadoop with New Relic, you can get comprehensive information about the workflow by visualizing and gathering metrics and logs. + + ## Why monitor Apache Hadoop? + Apache Hadoop is an open-source software package which enables distributed storage and processing of massive data by leveraging a network of computers. It utilizes the MapReduce programming model-based distributed storage and processing of massive data. + + By leveraging New Relic, you can gain a more in-depth understanding of Apache Hadoop performance and health. Monitoring Apache Hadoop with New Relic will help you better understand the HDFS (Hadoop Distributed File System), blocks, system load, data nodes, NodeManager, and jobs. + + ## What’s included in this quickstart? + New Relic Apache Hadoop monitoring quickstart provides quality out-of-the-box reporting: + - JVM metrics + - NameNode metrics + - DataNode metrics + - Cluster metrics + - Queue metrics + - Node manager metrics + - Infrastructure metrics + +summary: | + Optimize your Apache Hadoop performance using New Relic Apache Hadoop monitoring. +icon: logo.png +level: New Relic +authors: + - New Relic +title: Apache Hadoop +documentation: + - name: Apache Hadoop integration documentation + description: | + Monitor the performance metrics of your Apache Hadoop instances in real-time with New Relic Apache Hadoop. + url: >- + https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/apache-hadoop-monitoring-integration/ +keywords: + - apache-hadoop + - apache +dashboards: + - apache-hadoop +alertPolicies: + - apache-hadoop +dataSourceIds: + - apache-hadoop diff --git a/quickstarts/apache-hadoop/logo.png b/quickstarts/apache-hadoop/logo.png new file mode 100644 index 0000000000..9e260fbf53 Binary files /dev/null and b/quickstarts/apache-hadoop/logo.png differ