-
Notifications
You must be signed in to change notification settings - Fork 814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Kafka check - first pass #810
Changes from 3 commits
ce83570
055f5f7
4e8b83d
aedb1ae
9675070
21013c9
e69a531
7afe93a
e217163
a226bf5
0a55914
c2449e1
98155bc
04777a4
84b2946
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
init_config: | ||
# Metrics collected by this check. You should not have to modify this. | ||
conf: | ||
# | ||
# Aggregate cluster stats | ||
# | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsBytesOutPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.net.bytes_out | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsBytesInPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.net.bytes_in | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsMessagesInPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.messages_in | ||
|
||
# | ||
# Request timings | ||
# | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsFailedFetchRequestsPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.request.fetch.failed | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsFailedProduceRequestsPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.request.produce.failed | ||
- include: | ||
domain: '"kafka.network"' | ||
bean: '"kafka.network":type="RequestMetrics",name="Produce-TotalTimeMs"' | ||
attribute: | ||
Mean: | ||
metric_type: counter | ||
alias: kafka.request.produce.total_time.avg | ||
99thPercentile: | ||
metric_type: counter | ||
alias: kafka.request.produce.total_time.99percentile | ||
- include: | ||
domain: '"kafka.network"' | ||
bean: '"kafka.network":type="RequestMetrics",name="Fetch-TotalTimeMs"' | ||
attribute: | ||
Mean: | ||
metric_type: counter | ||
alias: kafka.request.fetch.total_time.avg | ||
99thPercentile: | ||
metric_type: counter | ||
alias: kafka.request.fetch.total_time.99percentile | ||
- include: | ||
domain: '"kafka.network"' | ||
bean: '"kafka.network":type="RequestMetrics",name="UpdateMetadata-TotalTimeMs"' | ||
attribute: | ||
Mean: | ||
metric_type: counter | ||
alias: kafka.request.update_metadata.total_time.avg | ||
99thPercentile: | ||
metric_type: counter | ||
alias: kafka.request.update_metadata.total_time.99percentile | ||
- include: | ||
domain: '"kafka.network"' | ||
bean: '"kafka.network":type="RequestMetrics",name="Metadata-TotalTimeMs"' | ||
attribute: | ||
Mean: | ||
metric_type: counter | ||
alias: kafka.request.metadata.total_time.avg | ||
99thPercentile: | ||
metric_type: counter | ||
alias: kafka.request.metadata.total_time.99percentile | ||
- include: | ||
domain: '"kafka.network"' | ||
bean: '"kafka.network":type="RequestMetrics",name="Offsets-TotalTimeMs"' | ||
attribute: | ||
Mean: | ||
metric_type: counter | ||
alias: kafka.request.offsets.total_time.avg | ||
99thPercentile: | ||
metric_type: counter | ||
alias: kafka.request.offsets.total_time.99percentile | ||
|
||
# | ||
# Replication stats | ||
# | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="ReplicaManager",name="ISRShrinksPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.replication.isr_shrinks | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="ReplicaManager",name="ISRExpandsPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.replication.isr_expands | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="ControllerStats",name="LeaderElectionRateAndTimeMs"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.replication.leader_elections | ||
- include: | ||
domain: '"kafka.server"' | ||
bean: '"kafka.server":type="ControllerStats",name="UncleanLeaderElectionsPerSec"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.replication.unclean_leader_elections | ||
|
||
# | ||
# Log flush stats | ||
# | ||
- include: | ||
domain: '"kafka.log"' | ||
bean: '"kafka.log":type="LogFlushStats",name="LogFlushRateAndTimeMs"' | ||
attribute: | ||
MeanRate: | ||
metric_type: gauge | ||
alias: kafka.log.flush_rate | ||
|
||
|
||
instances: | ||
# - host: localhost | ||
# port: 9999 | ||
# name: jmx_instance | ||
# user: username | ||
# password: password | ||
# #java_bin_path: /path/to/java #Optional, should be set if the agent cannot find your java executable | ||
# #trust_store_path: /path/to/trustStore.jks # Optional, should be set if ssl is enabled | ||
# #trust_store_password: password |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,15 @@ | |
logging.WARNING : "WARN", | ||
} | ||
|
||
JMX_CHECKS = ['tomcat', 'activemq', 'activemq_58', 'solr', 'cassandra', 'jmx'] | ||
JMX_CHECKS = [ | ||
'activemq', | ||
'activemq_58', | ||
'cassandra', | ||
'jmx', | ||
'kafka', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you should use the is_jmx flag in the init_config. I think that's what we should use for newer jmx checks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is that read? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
'solr', | ||
'tomcat', | ||
] | ||
JMX_FETCH_JAR_NAME = "jmxfetch-0.2.0-jar-with-dependencies.jar" | ||
JMX_LIST_COMMANDS = ['list_everything', 'list_collected_attributes', 'list_matching_attributes', 'list_not_matching_attributes', 'list_limited_attributes'] | ||
JMX_COLLECT_COMMAND = 'collect' | ||
|
@@ -42,7 +50,7 @@ def init(cls, confd_path, agentConfig, logging_config, default_check_frequency, | |
JMXFetch.stop() | ||
|
||
JMXFetch.start(confd_path, agentConfig, logging_config, java_bin_path, java_options, default_check_frequency, jmx_checks, command) | ||
except Exception, e: | ||
except Exception: | ||
log.exception("Error while initiating JMXFetch") | ||
|
||
|
||
|
@@ -51,11 +59,11 @@ def should_run(cls, confd_path): | |
""" | ||
Return a tuple (jmx_checks, java_bin_path) | ||
|
||
jmx_checks: list of yaml files that are jmx checks | ||
jmx_checks: list of yaml files that are jmx checks | ||
(they have the is_jmx flag enabled or they are in JMX_CHECKS) | ||
and that have at least one instance configured | ||
|
||
java_bin_path: is the path to the java executable. It was | ||
java_bin_path: is the path to the java executable. It was | ||
previously set in the "instance" part of the yaml file of the | ||
jmx check. So we need to parse yaml files to get it. | ||
We assume that this value is alwayws the same for every jmx check | ||
|
@@ -71,10 +79,6 @@ def should_run(cls, confd_path): | |
java_options = None | ||
|
||
for conf in glob.glob(os.path.join(confd_path, '*.yaml')): | ||
|
||
java_bin_path_is_set = java_bin_path is not None | ||
java_options_is_set = java_options is not None | ||
|
||
check_name = os.path.basename(conf).split('.')[0] | ||
|
||
if os.path.exists(conf): | ||
|
@@ -134,7 +138,7 @@ def is_running(cls): | |
if get_os() != 'windows': | ||
try: | ||
os.kill(pid, 0) | ||
# os.kill(pid, 0) will throw an exception if pid is not running | ||
# os.kill(pid, 0) will throw an exception if pid is not running | ||
# and won't do anything otherwise | ||
# It doesn't work on windows as signal.CTRL_C_EVENT is 0, it would quit the process | ||
return True | ||
|
@@ -203,7 +207,7 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o | |
java_run_opts = java_run_opts or "" | ||
path_to_jmxfetch = JMXFetch.get_path_to_jmxfetch() | ||
path_to_status_file = os.path.join(tempfile.gettempdir(), "jmx_status.yaml") | ||
|
||
subprocess_args = [ | ||
path_to_java, # Path to the java bin | ||
'-jar', | ||
|
@@ -213,8 +217,8 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o | |
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"), # Log Level: Mapping from Python log level to log4j log levels | ||
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'), # Path of the log file | ||
'--reporter', reporter, # Reporter to use | ||
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write | ||
command, # Name of the command | ||
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write | ||
command, # Name of the command | ||
] | ||
|
||
subprocess_args.insert(3, '--check') | ||
|
@@ -233,11 +237,11 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o | |
|
||
else: | ||
subprocess.call(subprocess_args) | ||
except OSError, e: | ||
|
||
except OSError: | ||
jmx_connector_pid = None | ||
log.exception("Couldn't launch JMXTerm. Is java in your PATH?") | ||
except Exception, e: | ||
except Exception: | ||
jmx_connector_pid = None | ||
log.exception("Couldn't launch JMXTerm") | ||
|
||
|
@@ -248,7 +252,7 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o | |
fp.write(str(jmx_connector_pid)) | ||
fp.close() | ||
os.chmod(JMXFetch.pid_file_path, 0644) | ||
except Exception, e: | ||
except Exception: | ||
log.exception("Unable to write jmxfetch pidfile: %s" % JMXFetch.pid_file_path) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick, you should add the instances section on top of the file so we don't have to scroll all the way down to configure a check.