From 83d0cb2a52a4c86ab75e9071313ce63221dbe4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E7=AC=A8=E8=9B=8B?= <105542329+a-little-fool@users.noreply.github.com> Date: Thu, 20 Jul 2023 17:50:08 +0800 Subject: [PATCH] feature: support apache spark metrics monitoring (#1114) --- home/docs/help/spark.md | 128 ++++++++ .../src/main/resources/define/app-spark.yml | 301 ++++++++++++++++++ 2 files changed, 429 insertions(+) create mode 100644 home/docs/help/spark.md create mode 100644 manager/src/main/resources/define/app-spark.yml diff --git a/home/docs/help/spark.md b/home/docs/help/spark.md new file mode 100644 index 00000000000..3d4b44828ea --- /dev/null +++ b/home/docs/help/spark.md @@ -0,0 +1,128 @@ +--- +id: spark +title: Monitoring Spark +sidebar_label: Spark Monitor +keywords: [open source monitoring tool, open source java spark monitoring tool, monitoring spark metrics] +--- + +> Collect and monitor the general performance Metrics of Spark. + +**Protocol Use:JMX** + +### Spark App Enable JMX Protocol + +1. 
Add Spark `VM options` When Start Server ⚠️ customIP + +Refer: https://spark.apache.org/docs/latest/spark-standalone.html + + +**监控配置spark的监控主要分为Master、Worker、driver、executor监控。Master和Worker的监控在spark集群运行时即可监控,Driver和Executor的监控需要针对某一个app来进行监控。** +**如果都要监控,需要根据以下步骤来配置** + + + +## 第一步 + +**修改$SPARK_HOME/conf/spark-env.sh,添加以下语句:** + +```shell +# JMX Port to use +SPARK_DAEMON_JAVA_OPTS="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + +# export SPARK_DAEMON_JAVA_OPTS="$SPARK_DAEMON_JAVA_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT " +export SPARK_DAEMON_JAVA_OPTS="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=8712 " +``` + +语句中有$JMX_PORT,这个的值可以自定义,也可以获取一个随机数作为端口号。 +如果端口自定义为一个具体的值,而 spark 的 Master 和其中之一的 Worker 在同一台机器上,会出现端口冲突的情况。 + + + +## 第二步 + +**vim $SPARK_HOME/conf/metrics.properties 添加如下内容** + +```shell +*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink +master.source.jvm.class=org.apache.spark.metrics.source.JvmSource +worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource +driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource +executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource +``` + + + + + +## 第三步 + +**vim $SPARK_HOME/conf/spark-defaults.conf,添加以下项为driver和executor设置监控端口,在有程序运行的情况下,此端口会被打开。** + +```shell +spark.metrics.conf /opt/bigdata/spark/conf/metrics.properties +spark.driver.extraJavaOptions -XX:+PrintGCDetails -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=8712 + +spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=8711 +``` + 
+在spark的Master和Worker正常运行以及spark-submit提交了一个程序的情况下,可以从linux中查询出端口号码。 + + + +### Configuration parameter + +| Parameter name | Parameter help description | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Monitoring Host | Monitored IPV4, IPV6 or domain name. Note⚠️Without protocol header (eg: https://, http://) | +| Monitoring name | Identify the name of this monitoring. The name needs to be unique | +| Port | Port provided by JMX | +| Username | JMX connection user name, optional | +| Password | JMX connection password, optional | +| Collection interval | Interval time of monitor periodic data collection, unit: second, and the minimum interval that can be set is 30 seconds | +| Whether to detect | Whether to detect and check the availability of monitoring before adding monitoring. Adding and modifying operations will continue only after the detection is successful | +| Description remarks | For more information about identifying and describing this monitoring, users can note information here | + +### Collection Metrics + +#### Metrics Set:memory_pool + +| Metric name | Metric unit | Metric help description | +|-------------|-------------|-------------------------| +| name | | metrics name | +| committed | MB | total size | +| init | MB | init size | +| max | MB | max size | +| used | MB | used size | + +#### Metrics Set:code_cache (Only Support JDK8) + +| Metric name | Metric unit | Metric help description | +|-------------|-------------|-------------------------| +| committed | B | total size | +| init | B | init size | +| max | B | max size | +| used | B | used size | + +#### Metrics Set:class_loading + +| Metric name | Metric unit | Metric help description | +|-----------------------|-------------|--------------------------| +| LoadedClassCount | | Loaded Class Count | +| TotalLoadedClassCount | | Total 
Loaded Class Count | +| UnloadedClassCount | | Unloaded Class Count | + + +#### Metrics Set:thread + +| Metric name | Metric unit | Metric help description | +|-------------------------|-------------|----------------------------| +| TotalStartedThreadCount | | Total Started Thread Count | +| ThreadCount | | Thread Count | +| PeakThreadCount | | Peak Thread Count | +| DaemonThreadCount | | Daemon Thread Count | +| CurrentThreadUserTime | ms | Current Thread User Time | +| CurrentThreadCpuTime | ms | Current Thread Cpu Time | + + diff --git a/manager/src/main/resources/define/app-spark.yml b/manager/src/main/resources/define/app-spark.yml new file mode 100644 index 00000000000..0e2515bdf4a --- /dev/null +++ b/manager/src/main/resources/define/app-spark.yml @@ -0,0 +1,301 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The monitoring type category:service-application service monitoring db-database monitoring custom-custom monitoring os-operating system monitoring +# 监控类型所属类别:service-应用服务监控 db-数据库监控 custom-自定义监控 os-操作系统监控 cn-云原生cloud native network-网络监控 mid-中间件 +category: mid +# The monitoring type eg: linux windows tomcat mysql aws... +# 监控类型 eg: linux windows tomcat mysql aws... 
+app: spark +# The monitoring i18n name +# 监控类型国际化名称 +name: + zh-CN: Apache Spark + en-US: Apache Spark +# Input params define for monitoring(render web ui by the definition) +# 监控所需输入参数定义(根据定义渲染页面UI) +params: + # field-param field key + # field-字段名称标识符 + - field: host + # name-param field display i18n name + # name-参数字段显示名称 + name: + zh-CN: 主机Host + en-US: Host + # type-param field type(most mapping the html input type) + # type-字段类型,样式(大部分映射input标签type属性) + type: host + # required-true or false + # 是否是必输项 true-必填 false-可选 + required: true + # field-param field key + # field-变量字段标识符 + - field: port + # name-param field display i18n name + # name-参数字段显示名称 + name: + zh-CN: 端口 + en-US: Port + # type-param field type(most mapping the html input type) + # type-字段类型,样式(大部分映射input标签type属性) + type: number + # when type is number, range is required + # 当type为number时,用range表示范围 + range: '[0,65535]' + # required-true or false + # required-是否是必输项 true-必填 false-可选 + required: true + # default value + # 端口默认值 + defaultValue: 8712 + # field-param field key + # field-变量字段标识符 + - field: url + # name-param field display i18n name + # name-参数字段显示名称 + name: + zh-CN: JMX URL + en-US: JMX URL + # type-param field type(most mapping the html input type) + # type-字段类型,样式(大部分映射input标签type属性) + type: text + # required-true or false + # required-是否是必输项 true-必填 false-可选 + required: false + # hide param-true or false + # 是否隐藏字段 true or false + hide: true + # param field input placeholder + # 参数输入框提示信息 + placeholder: 'service:jmx:rmi:///jndi/rmi://host:port/jmxrmi' + # field-param field key + # field-变量字段标识符 + - field: username + # name-param field display i18n name + # name-参数字段显示名称 + name: + zh-CN: 用户名 + en-US: Username + # type-param field type(most mapping the html input type) + # type-字段类型,样式(大部分映射input标签type属性) + type: text + # when type is text, use limit to limit string length + # 当type为text时,用limit表示字符串限制大小 + limit: 20 + # required-true or false + # required-是否是必输项 true-必填 false-可选 + 
required: false + # hide param-true or false + # 是否隐藏字段 true or false + hide: true + # field-param field key + # field-变量字段标识符 + - field: password + # name-param field display i18n name + # name-参数字段显示名称 + name: + zh-CN: 密码 + en-US: Password + # type-param field type(most mapping the html input tag) + # type-字段类型,样式(大部分映射input标签type属性) + type: password + # required-true or false + # required-是否是必输项 true-必填 false-可选 + required: false + # hide param-true or false + # 是否隐藏字段 true or false + hide: true +# collect metrics config list +# 采集指标组配置列表 +metrics: + # metrics - basic + # 监控指标组 - basic + - name: basic + # metrics group scheduling priority(0->127)->(high->low), metrics with the same priority will be scheduled in parallel + # priority 0's metrics group is availability metrics, it will be scheduled first, only availability metrics collect success will the scheduling continue + # 指标组调度优先级(0->127)->(优先级高->低) 优先级低的指标组会等优先级高的指标组采集完成后才会被调度, 相同优先级的指标组会并行调度采集 + # 优先级为0的指标组为可用性指标组,即它会被首先调度,采集成功才会继续调度其它指标组,采集失败则中断调度 + priority: 0 + # collect metrics content + # 具体监控指标列表 + fields: + # field-metric name, type-metric type(0-number,1-string), unit-metric unit('%','ms','MB'), instance-if is metrics group unique identifier + # field-指标名称, type-指标类型(0-number数字,1-string字符串), unit-指标单位('%','ms','MB'), instance-是否是指标集合唯一标识符字段 + - field: VmName + type: 1 + - field: VmVendor + type: 1 + - field: VmVersion + type: 1 + - field: Uptime + type: 0 + unit: ms + # the protocol used for monitoring, eg: sql, ssh, http, telnet, wmi, snmp, sdk + # 用于监控的协议,例: sql, ssh, http, telnet, wmi, snmp, sdk + protocol: jmx + # the config content when protocol is jmx + jmx: + # host: ipv4 ipv6 domain + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # port + # 端口 + port: ^_^port^_^ + username: ^_^username^_^ + password: ^_^password^_^ + # jmx mbean object name + # jmx mbean 对象名称 + objectName: java.lang:type=Runtime + url: ^_^url^_^ + + - name: memory_pool + priority: 1 + fields: + - field: name + type: 1 + 
instance: true + - field: committed + type: 0 + unit: MB + - field: init + type: 0 + unit: MB + - field: max + type: 0 + unit: MB + - field: used + type: 0 + unit: MB + units: + - committed=B->MB + - init=B->MB + - max=B->MB + - used=B->MB + # (optional)metrics field alias name, it is used as an alias field to map and convert the collected data and metrics field + # (可选)监控指标别名, 做为中间字段与采集数据字段和指标字段映射转换 + aliasFields: + - Name + - Usage->committed + - Usage->init + - Usage->max + - Usage->used + # mapping and conversion expressions, use these and aliasField above to calculate metrics value + # (可选)指标映射转换计算表达式,与上面的别名一起作用,计算出最终需要的指标值 + # eg: cores=core1+core2, usage=usage, waitTime=allTime-runningTime + calculates: + - name=Name + - committed=Usage->committed + - init=Usage->init + - max=Usage->max + - used=Usage->used + protocol: jmx + jmx: + # host: ipv4 ipv6 domain + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # port + # 端口 + port: ^_^port^_^ + username: ^_^username^_^ + password: ^_^password^_^ + objectName: java.lang:type=MemoryPool,name=* + url: ^_^url^_^ + + - name: code_cache + priority: 5 + fields: + - field: committed + type: 0 + - field: init + type: 0 + - field: max + type: 0 + - field: used + type: 0 + aliasFields: + - Usage->committed + - Usage->init + - Usage->max + - Usage->used + calculates: + - committed=Usage->committed + - init=Usage->init + - max=Usage->max + - used=Usage->used + protocol: jmx + jmx: + # host: ipv4 ipv6 domain + # 主机host: ipv4 ipv6 域名 + host: ^_^host^_^ + # port + # 端口 + port: ^_^port^_^ + username: ^_^username^_^ + password: ^_^password^_^ + objectName: java.lang:type=MemoryPool,name=Code Cache + url: ^_^url^_^ + + - name: class_loading + priority: 6 + # collect metrics content + # 具体监控指标列表 + fields: + # field-metric name, type-metric type(0-number,1-string), unit-metric unit('%','ms','MB'), instance-if is metrics group unique identifier + # field-指标名称, type-指标类型(0-number数字,1-string字符串), unit-指标单位('%','ms','MB'), 
instance-是否是指标集合唯一标识符字段 + - field: LoadedClassCount + type: 0 + - field: TotalLoadedClassCount + type: 0 + - field: UnloadedClassCount + type: 0 + protocol: jmx + jmx: + host: ^_^host^_^ + port: ^_^port^_^ + username: ^_^username^_^ + password: ^_^password^_^ + objectName: java.lang:type=ClassLoading + url: ^_^url^_^ + + - name: thread + priority: 7 + # collect metrics content + # 指标组中的具体监控指标 + fields: + # field-metric name, type-metric type(0-number,1-string), unit-metric unit('%','ms','MB'), instance-if is metrics group unique identifier + # field-指标名称, type-指标类型(0-number数字,1-string字符串), unit-指标单位('%','ms','MB'), instance-是否是指标集合唯一标识符字段 + - field: TotalStartedThreadCount + type: 0 + - field: ThreadCount + type: 0 + - field: PeakThreadCount + type: 0 + - field: DaemonThreadCount + type: 0 + - field: CurrentThreadUserTime + type: 0 + unit: s + - field: CurrentThreadCpuTime + type: 0 + unit: s + protocol: jmx + jmx: + host: ^_^host^_^ + port: ^_^port^_^ + username: ^_^username^_^ + password: ^_^password^_^ + objectName: java.lang:type=Threading + url: ^_^url^_^