Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kafka check - first pass #810

Merged
merged 15 commits into from
Mar 11, 2014
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions conf.d/kafka.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
init_config:
# Metrics collected by this check. You should not have to modify this.
conf:
#
# Aggregate cluster stats
#
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsBytesOutPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.net.bytes_out
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsBytesInPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.net.bytes_in
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsMessagesInPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.messages_in

#
# Request timings
#
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsFailedFetchRequestsPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.request.fetch.failed
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="BrokerTopicMetrics",name="AllTopicsFailedProduceRequestsPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.request.produce.failed
- include:
domain: '"kafka.network"'
bean: '"kafka.network":type="RequestMetrics",name="Produce-TotalTimeMs"'
attribute:
Mean:
metric_type: counter
alias: kafka.request.produce.total_time.avg
99thPercentile:
metric_type: counter
alias: kafka.request.produce.total_time.99percentile
- include:
domain: '"kafka.network"'
bean: '"kafka.network":type="RequestMetrics",name="Fetch-TotalTimeMs"'
attribute:
Mean:
metric_type: counter
alias: kafka.request.fetch.total_time.avg
99thPercentile:
metric_type: counter
alias: kafka.request.fetch.total_time.99percentile
- include:
domain: '"kafka.network"'
bean: '"kafka.network":type="RequestMetrics",name="UpdateMetadata-TotalTimeMs"'
attribute:
Mean:
metric_type: counter
alias: kafka.request.update_metadata.total_time.avg
99thPercentile:
metric_type: counter
alias: kafka.request.update_metadata.total_time.99percentile
- include:
domain: '"kafka.network"'
bean: '"kafka.network":type="RequestMetrics",name="Metadata-TotalTimeMs"'
attribute:
Mean:
metric_type: counter
alias: kafka.request.metadata.total_time.avg
99thPercentile:
metric_type: counter
alias: kafka.request.metadata.total_time.99percentile
- include:
domain: '"kafka.network"'
bean: '"kafka.network":type="RequestMetrics",name="Offsets-TotalTimeMs"'
attribute:
Mean:
metric_type: counter
alias: kafka.request.offsets.total_time.avg
99thPercentile:
metric_type: counter
alias: kafka.request.offsets.total_time.99percentile

#
# Replication stats
#
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="ReplicaManager",name="ISRShrinksPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.replication.isr_shrinks
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="ReplicaManager",name="ISRExpandsPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.replication.isr_expands
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="ControllerStats",name="LeaderElectionRateAndTimeMs"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.replication.leader_elections
- include:
domain: '"kafka.server"'
bean: '"kafka.server":type="ControllerStats",name="UncleanLeaderElectionsPerSec"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.replication.unclean_leader_elections

#
# Log flush stats
#
- include:
domain: '"kafka.log"'
bean: '"kafka.log":type="LogFlushStats",name="LogFlushRateAndTimeMs"'
attribute:
MeanRate:
metric_type: gauge
alias: kafka.log.flush_rate


instances:
# - host: localhost
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick, you should add the instances section on top of the file so we don't have to scroll all the way down to configure a check.

# port: 9999
# name: jmx_instance
# user: username
# password: password
# #java_bin_path: /path/to/java #Optional, should be set if the agent cannot find your java executable
# #trust_store_path: /path/to/trustStore.jks # Optional, should be set if ssl is enabled
# #trust_store_password: password
36 changes: 20 additions & 16 deletions jmxfetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@
logging.WARNING : "WARN",
}

JMX_CHECKS = ['tomcat', 'activemq', 'activemq_58', 'solr', 'cassandra', 'jmx']
JMX_CHECKS = [
'activemq',
'activemq_58',
'cassandra',
'jmx',
'kafka',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you should use the is_jmx flag in the init_config. I think that's what we should use for newer jmx checks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is that read?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'solr',
'tomcat',
]
JMX_FETCH_JAR_NAME = "jmxfetch-0.2.0-jar-with-dependencies.jar"
JMX_LIST_COMMANDS = ['list_everything', 'list_collected_attributes', 'list_matching_attributes', 'list_not_matching_attributes', 'list_limited_attributes']
JMX_COLLECT_COMMAND = 'collect'
Expand All @@ -42,7 +50,7 @@ def init(cls, confd_path, agentConfig, logging_config, default_check_frequency,
JMXFetch.stop()

JMXFetch.start(confd_path, agentConfig, logging_config, java_bin_path, java_options, default_check_frequency, jmx_checks, command)
except Exception, e:
except Exception:
log.exception("Error while initiating JMXFetch")


Expand All @@ -51,11 +59,11 @@ def should_run(cls, confd_path):
"""
Return a tuple (jmx_checks, java_bin_path)

jmx_checks: list of yaml files that are jmx checks
jmx_checks: list of yaml files that are jmx checks
(they have the is_jmx flag enabled or they are in JMX_CHECKS)
and that have at least one instance configured

java_bin_path: is the path to the java executable. It was
java_bin_path: is the path to the java executable. It was
previously set in the "instance" part of the yaml file of the
jmx check. So we need to parse yaml files to get it.
We assume that this value is always the same for every jmx check
Expand All @@ -71,10 +79,6 @@ def should_run(cls, confd_path):
java_options = None

for conf in glob.glob(os.path.join(confd_path, '*.yaml')):

java_bin_path_is_set = java_bin_path is not None
java_options_is_set = java_options is not None

check_name = os.path.basename(conf).split('.')[0]

if os.path.exists(conf):
Expand Down Expand Up @@ -134,7 +138,7 @@ def is_running(cls):
if get_os() != 'windows':
try:
os.kill(pid, 0)
# os.kill(pid, 0) will throw an exception if pid is not running
# os.kill(pid, 0) will throw an exception if pid is not running
# and won't do anything otherwise
# It doesn't work on windows as signal.CTRL_C_EVENT is 0, it would quit the process
return True
Expand Down Expand Up @@ -203,7 +207,7 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o
java_run_opts = java_run_opts or ""
path_to_jmxfetch = JMXFetch.get_path_to_jmxfetch()
path_to_status_file = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")

subprocess_args = [
path_to_java, # Path to the java bin
'-jar',
Expand All @@ -213,8 +217,8 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"), # Log Level: Mapping from Python log level to log4j log levels
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'), # Path of the log file
'--reporter', reporter, # Reporter to use
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write
command, # Name of the command
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write
command, # Name of the command
]

subprocess_args.insert(3, '--check')
Expand All @@ -233,11 +237,11 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o

else:
subprocess.call(subprocess_args)
except OSError, e:

except OSError:
jmx_connector_pid = None
log.exception("Couldn't launch JMXTerm. Is java in your PATH?")
except Exception, e:
except Exception:
jmx_connector_pid = None
log.exception("Couldn't launch JMXTerm")

Expand All @@ -248,7 +252,7 @@ def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_o
fp.write(str(jmx_connector_pid))
fp.close()
os.chmod(JMXFetch.pid_file_path, 0644)
except Exception, e:
except Exception:
log.exception("Unable to write jmxfetch pidfile: %s" % JMXFetch.pid_file_path)