Commit
* Fix Prometheus label cardinality when reporting disconnect (#245)
* Add hints about monitoring "large" queue managers in TUNING.md
* Make sure missing YAML configuration attributes have reasonable defaults
* Update all vendored dependencies
Showing 58 changed files with 11,478 additions and 832 deletions.
@@ -1,2 +1,7 @@
.git/
.github/
.git/

# We use a file in the .git/refs directory to try to extract current commit level
# so exclude that directory from the exclusions
!.git/refs
@@ -0,0 +1,9 @@
If you are using the Dockerfile in the root of this repository
to build and run an exporter/collector program:

For Linux platforms without a Redistributable Client package, but
with full install packages, copy the .deb installation files into
this directory.

For Linux/Arm64, copy the .tar.gz installation file into this
directory.
@@ -0,0 +1,121 @@
# Tuning hints for monitoring "large" queue managers

If you have a large queue manager - perhaps several thousand queues - then a lot of data could be produced when
monitoring those queues. Some default configuration options might need tuning to get acceptable performance. Reducing
the frequency of generation and/or collection may be appropriate. Tuning might be done in several places: in this
collector, in the database configuration, and in the queue manager.

The following sections describe different pieces that you might want to look at.

The document is mostly written from the viewpoint of using Prometheus as the database. That is mainly because
Prometheus has a unique "pull" model, where the server calls the collector at configured intervals. The other databases
and collector technologies supported by this repository have a simpler way of "pushing" data to the various backends.
However, much of the document is relevant regardless of where the metrics end up.

## Collector location
It is most efficient to run the collector program as a local bindings application, connecting directly to the queue
manager. That removes all the MQ Client flows that would have to be done for every message.

If you cannot avoid running as a client (for example, you are trying to monitor the MQ Appliance or z/OS), then keep the
network latency between the queue manager and collector as low as possible. For z/OS, you might consider running the
collector in a zLinux LPAR on the same machine, or perhaps in a zCX container.

Also configure the client to take advantage of readahead when getting publications. This is done by setting
`DEFREADA(YES)` on the nominated ReplyQueue(s).

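For example, assuming the collector has been pointed at a predefined local reply queue (the queue name below is just a
placeholder), readahead could be enabled with an MQSC command such as:
```
ALTER QLOCAL(MQMON.REPLY.QUEUE) DEFREADA(YES)
```
If the collector instead uses a dynamic reply queue created from a model queue, make the equivalent change to that
model queue.
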
## Collection processing time
The collector reports on how long it takes to collect and process the data on each interval. You can see this in a debug
log. The Prometheus collector also has an `ibmmq_qmgr_exporter_collection_time` metric. Note that this time is the value
as seen by the main collection thread; the real total time as seen by Prometheus is usually longer. There is likely
still work going on in the background to send metrics to the database, and for them to be successfully ingested.

The first time that the collection time exceeds the Prometheus default `scrape_timeout` value, a warning message is
emitted. This can be ignored if you are expecting a scrape to take a longer period. But it can be helpful if you didn't
know that you might need to do some tuning.

The true total time taken for a scrape can be seen in Prometheus directly. For example, you can use the administrative
interface at `http://<server>:9090/targets?search=` and find the target corresponding to your queue manager.

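You can also compare the two views of the timing with a couple of queries in the Prometheus expression browser. This is
only a sketch; the `job` label value is an assumption about how your scrape job is named:
```
# Processing time as reported by the collector's own metric
ibmmq_qmgr_exporter_collection_time

# Prometheus's measurement of how long the whole scrape took, in seconds
scrape_duration_seconds{job="ibmmq"}
```
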
For other collectors, there is no specific metric. But the timestamps on each collection block allow you to deduce the
time taken: the difference between successive iterations is the collection period plus the `interval` configuration
value.

## Ensuring collection intervals have enough time to run
The Prometheus `scrape_configs` attributes can be set globally or for individual scrape jobs. In particular, you will
probably want to change the `scrape_interval` and `scrape_timeout` values for the jobs associated with large queue
managers. Use the reported collection processing time as a basis from which to set these values.

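For example, a per-job override in the Prometheus server configuration might look like the sketch below. The job name,
target address, and timings are placeholders to adapt to your environment:
```
scrape_configs:
  - job_name: 'ibmmq-big-qmgr'
    scrape_interval: 60s        # allow a longer gap between scrapes of this collector
    scrape_timeout: 50s         # must not be larger than scrape_interval
    static_configs:
      - targets: ['mqhost.example.com:9157']
```
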
For other collector models, the collector-specific `interval` attribute determines the gap between each push of the
metrics. There is no "maximum" collection time.

## Reducing metric publication frequency from the queue manager
By default, the queue manager publishes resource metrics every 10 seconds. This matches fairly well with the Prometheus
default scrape interval of 15s. But if you increase the scrape interval, you might also want to reduce the frequency of
publications so that fewer "merges" have to be done when processing the subscription destination queues. Adding the
following stanza to the _qm.ini_ file changes that frequency:
```
TuningParameters:
  MonitorPublishHeartBeat = 30
```
This value is given in seconds, and the attribute name is case-sensitive. Because increasing the value reduces the
frequency of generation, it may cause you to miss shorter-lived transient spikes in some values; that is the tradeoff
you have to evaluate. On the other hand, a value smaller than the time taken to process the publications might result
in a never-ending scrape. The publication-processing portion of the scrape can be seen in a debug log.

## Reducing subscriptions made to the queue manager
Reducing the total number of subscriptions made will reduce the data that needs to be processed, but at the cost of
missing some metrics that you might find useful. See also the section in the [README](README.md) file about using
durable subscriptions. A configuration sketch showing these options follows the list.

* You can disable all use of published resource metrics, and rely on the `DISPLAY xxSTATUS` responses. This clearly
reduces the data, but you lose out on many useful metrics. It is essentially how we monitor z/OS queue managers, as
they do not have the publication model for metrics. But if you want this approach, set the `global.usePublications`
configuration option to `false`.

* You can reduce the total number of subscriptions made for queue metrics. The `filters.queueSubscriptionSelector` list
defines the sets of topics that you might be interested in. The complete set - for now - is
[OPENCLOSE, INQSET, PUT, GET, GENERAL]. In many cases, only the last three of these may be of interest. The smaller
set reduces the number of publications per queue. Within each set, multiple metrics are created but there is no way to
report on only a subset of the metrics in each set.

* You can choose to not subscribe to any queue metrics, but still subscribe to metrics for other resources such as the
queue manager and Native HA by setting the filter to `NONE`. If you do this, then many queue metrics become
unavailable. However, the current queue depth will still be available as it can also be determined from the
`DISPLAY QSTATUS` response.

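As a sketch of how those options appear in the collector's YAML configuration - the layout follows the sample
configuration files in this repository, but verify the exact attribute names and structure against your own config
file:
```
global:
  usePublications: true         # set to false to rely solely on DISPLAY xxSTATUS responses
filters:
  queueSubscriptionSelector:    # subscribe only to the topic sets you need
    - PUT
    - GET
    - GENERAL
```
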
## Reducing the number of monitored objects and status requests
Each object type (queues, channels, etc.) has a block in the collector configuration that names which objects should be
monitored. While both positive and negative wildcards can be used in these blocks, it is probably most efficient to use
only positive wildcards. That allows the `DISPLAY xxSTATUS` requests to pass the wildcards directly into the queue
manager commands; if there are any negative patterns, the collector has to work out which objects match the pattern, and
then inquire for the remainder individually.

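For illustration only, a monitored-objects block that sticks to positive wildcards might look like the following. The
attribute names follow the sample configuration files, and the patterns are placeholders for your own naming
conventions:
```
objects:
  queues:
    - "APP.*"          # positive patterns can be passed straight to the queue manager
    - "PAYROLL.*"
  channels:
    - "TO.*"
  # A negative pattern such as "!SYSTEM.*" would force the collector to resolve
  # the matching objects itself and then inquire on the remainder individually.
```
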
## Other configuration options
The `global.pollInterval` and `global.rediscoverInterval` options may help to further reduce inquiries.

The first of these controls how frequently the `DISPLAY xxSTATUS` commands are used, assuming `global.useObjectStatus`
is `true`. In some circumstances, you might not need the status responses as regularly as the published metrics are
handled.

The second attribute controls how frequently the collector reassesses the list of objects to be monitored, and their
more stable attributes, for example the `DESCRIPTION` or `MAXDEPTH` settings on a queue. If you have a large number of
queues that do not change frequently, then you might want to increase the rediscovery interval. The default is 1 hour.
The tradeoff here is that newly-defined queues may not have any metrics reported until this interval expires.

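A sketch of how those two options might be set in the collector's YAML configuration - the duration format shown is an
assumption, so check the sample configuration files for the exact form accepted:
```
global:
  useObjectStatus: true
  pollInterval: 30s          # minimum gap between issuing the DISPLAY xxSTATUS commands
  rediscoverInterval: 2h     # how often the object list and its stable attributes are refreshed
```
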
## Dividing the workload
One further approach that you might like to consider, though I wouldn't usually recommend it, is to have two or more
collectors running against the same queue manager, each configured to monitor a different set of queues. So a
collector listening on port 9157 might manage queues A*-M*, while another collector on port 9158 monitors queues N*-Z*.
You would likely need additional configuration to reduce duplication of other components, for example by using the
`jobname` or `instance` as a filter element on dashboard queries, but it might be one way to reduce the time taken for a
single scrape.

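Purely as an illustration of that idea, the two collector configurations might differ only in their queue patterns and
listener ports. The attribute names, patterns, and port settings below are assumptions to adapt to your own
configuration files:
```
# Collector instance 1
objects:
  queues: ["A*", "B*", "C*"]     # ...continuing through "M*"
prometheus:
  port: 9157
---
# Collector instance 2
objects:
  queues: ["N*", "P*", "Q*"]     # ...continuing through "Z*"
prometheus:
  port: 9158
```
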
## Very slow queue managers
The collectors wait for a short time for each response to a status request. If the timeout expires with no expected
message appearing, then an error is reported. Some queue managers - particularly when hosted in cloud services - have
appeared to "stall" for a period: even though they are not especially busy, the response messages have not appeared in
time. The default wait of 3 seconds can be tuned using the `connection.waitInterval` option.

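For example, a sketch of raising that timeout in the collector's YAML configuration; the value is assumed to be given
in seconds, matching the documented default of 3:
```
connection:
  waitInterval: 10    # seconds to wait for each status response before reporting an error
```
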
For all collectors _except_ Prometheus, a small number of these timeout errors are permitted consecutively. The failure
count is reset after a successful collection. See _pkg/errors/errors.go_ for details. The Prometheus collector has an
automatic reconnect option after failures, so does not currently use this strategy.