
Implementing inode full detection support, closes #18
AndrewFarley committed Oct 17, 2023
1 parent 7c538b3 commit fce880d
Showing 4 changed files with 149 additions and 12 deletions.
42 changes: 42 additions & 0 deletions Makefile
@@ -0,0 +1,42 @@
.PHONY: deps run run-hot start lint test-local help
.DEFAULT_GOAL := help

SHELL = bash

# Install dependencies
deps:
pip3 install -r requirements.txt

# Default logic for "make run" or "make start". This is dry-run'd to prevent it from doing anything while developing
run: deps
@echo -e "\n----- Starting service locally -----"
# NOTE: this local env file is where you can put secrets and such to keep them from being committed
touch unused-local-envs.sh
source unused-local-envs.sh && \
DRY_RUN=true \
VERBOSE=true \
python3 main.py

# Warning: this will run it "hot", with no dry-run in place
run-hot: deps
@echo -e "\n----- Starting service locally -----"
# NOTE: this local env file is where you can put secrets and such to keep them from being committed
touch unused-local-envs.sh
source unused-local-envs.sh && \
python3 main.py

# Alternate for "run"
start: run

# Lint our code
lint: deps
black .

test-local:
@echo -e "TODO - Add tests"

help:
@echo -e "Makefile options possible\n------------------------------"
@echo -e "make deps # Install dependencies"
@echo -e "make run # Run service locally"
@echo -e "make start # (alternate) Run service locally"
39 changes: 39 additions & 0 deletions examples/simple-pod-with-pvc-inode.yaml
@@ -0,0 +1,39 @@
# The example below creates a PVC using the default StorageClass, which you should
# have configured with allowVolumeExpansion set to true before using this. When the pod
# boots up it will continuously create empty files on the PVC, exhausting its inodes,
# which should, if you have the Volume Autoscaler installed, automatically expand the
# volume based on the default parameters
#
# Simply run: kubectl apply -f examples/simple-pod-with-pvc-inode.yaml
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: test-claim1
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 3G
---
apiVersion: v1
kind: Pod
metadata:
  name: test-claim1
spec:
  containers:
    - name: write
      image: alpine:latest
      command: ["/bin/sh"]
      args: ["-c", "cd /mnt/pv; i=0; while true; do touch \"file_$((i++))\"; done"]
      volumeMounts:
        - mountPath: "/mnt/pv"
          name: test-volume
      stdin: true
      stdinOnce: true
      tty: true
  volumes:
    - name: test-volume
      persistentVolumeClaim:
        claimName: test-claim1
  restartPolicy: Never
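
To check by hand that the pod above really is consuming inodes, you can run the same PromQL expression this commit adds to helpers.py directly against Prometheus. A minimal sketch (not part of the commit); the Prometheus URL and the label selector here are illustrative assumptions:

import requests

PROMETHEUS_URL = "http://prometheus:9090"  # assumption: adjust to your environment
label_match = 'persistentvolumeclaim="test-claim1"'

# Same inode-usage-percent expression used by the autoscaler (pre-2.30.0 variant)
query = (
    "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} "
    "/ kubelet_volume_stats_inodes)*100)"
).format(label_match)

result = requests.get(PROMETHEUS_URL + "/api/v1/query", params={"query": query}, timeout=15).json()
for item in result["data"]["result"]:
    print(item["metric"]["persistentvolumeclaim"], item["value"][1] + "%")
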
37 changes: 33 additions & 4 deletions helpers.py
@@ -37,7 +37,7 @@ def detectPrometheusURL():
DRY_RUN = True if getenv('DRY_RUN', "false").lower() == "true" else False # If we want to dry-run this
PROMETHEUS_LABEL_MATCH = getenv('PROMETHEUS_LABEL_MATCH') or '' # A PromQL label query to restrict volumes for this to see and scale, without braces. eg: 'namespace="dev"'
HTTP_TIMEOUT = int(getenv('HTTP_TIMEOUT', "15")) or 15 # Allows to set the timeout for calls to Prometheus and Kubernetes. This might be needed if your Prometheus or Kubernetes is over a remote WAN link with high latency and/or is heavily loaded
PROMETHEUS_VERSION = "Unknown" # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
PROMETHEUS_VERSION = "0.0.0" # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
VERBOSE = True if getenv('VERBOSE', "false").lower() == "true" else False # If we want verbose mode
VICTORIAMETRICS_COMPAT = True if getenv('VICTORIAMETRICS_MODE', "false").lower() == "true" else False # Whether to skip the prometheus check and assume victoriametrics
SCOPE_ORGID_AUTH_HEADER = getenv('SCOPE_ORGID_AUTH_HEADER') or '' # If we want to use Mimir or AgentMode which requires an orgid header. See: https://grafana.com/docs/mimir/latest/references/http-api/#authentication
@@ -489,7 +489,36 @@ def fetch_pvcs_from_prometheus(url, label_match=PROMETHEUS_LABEL_MATCH):
print("Prometheus Error: {}".format(response_object['error']))
exit(-1)

return response_object['data']['result']
# Try to also fetch inode percentage usage and inject it into the results below
try:
if version.parse(PROMETHEUS_VERSION) >= version.parse("2.30.0"):
inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100) and present_over_time(kubelet_volume_stats_inodes_free{{ {} }}[1h])".format(label_match,label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
else:
inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100)".format(label_match,label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
inodes_response_object = inodes_response.json()

# Prepare values to merge/inject with our first response_object list/array above
inject_values = {}
for item in inodes_response_object['data']['result']:
ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
inject_values[ourkey] = item['value'][1]

output_response_object = []
# Inject/merge them...
for item in response_object['data']['result']:
try:
ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
if ourkey in inject_values:
item['value_inodes'] = inject_values[ourkey]
except Exception as e:
print("Caught exception while trying to inject, please report me...")
print(e)
output_response_object.append(item)
except Exception as e:
print("Caught exception while trying to inject inode usage, please report me...")
print(e)

return output_response_object
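
For clarity, after this merge step each item returned by fetch_pvcs_from_prometheus keeps its existing disk-usage value and may carry an extra value_inodes field; the consumer in main.py (below) falls back to -1 when that field is absent. A minimal sketch with made-up values (not part of the commit):

# Illustrative shape of one merged result item (values are hypothetical)
item = {
    "metric": {"namespace": "default", "persistentvolumeclaim": "test-claim1"},
    "value": [1697500000, "47"],   # percent of disk space used, as a string
    "value_inodes": "93",          # only present when the inode query returned data
}

volume_used_percent = int(item["value"][1])
try:
    volume_used_inode_percent = int(item["value_inodes"])
except (KeyError, ValueError):
    volume_used_inode_percent = -1  # mirrors the fallback used in main.py
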


# Describe a specific PVC
@@ -550,13 +579,13 @@ def send_kubernetes_event(namespace, name, reason, message, type="Normal"):
# Print a sexy human readable dict for volume
def print_human_readable_volume_dict(input_dict):
for key in input_dict:
print(" {}: {}".format(key.rjust(24), input_dict[key]), end='')
print(" {}: {}".format(key.rjust(25), input_dict[key]), end='')
if key in ['volume_size_spec','volume_size_spec_bytes','volume_size_status','volume_size_status_bytes','scale_up_min_increment','scale_up_max_increment','scale_up_max_size'] and is_integer_or_float(input_dict[key]):
print(" ({})".format(convert_bytes_to_storage(input_dict[key])), end='')
if key in ['scale_cooldown_time']:
print(" ({})".format(time.strftime('%H:%M:%S', time.gmtime(input_dict[key]))), end='')
if key in ['last_resized_at']:
print(" ({})".format(time.strftime('%Y-%m-%d %H:%M:%S %Z %z', time.localtime(input_dict[key]))), end='')
if key in ['scale_up_percent','scale_above_percent']:
if key in ['scale_up_percent','scale_above_percent','volume_used_percent','volume_used_inode_percent']:
print("%", end='')
print("") # Newline
43 changes: 35 additions & 8 deletions main.py
@@ -87,25 +87,37 @@
volume_namespace = str(item['metric']['namespace'])
volume_description = "{}.{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
volume_used_percent = int(item['value'][1])
pvcs_in_kubernetes[volume_description]['volume_used_percent'] = volume_used_percent
try:
volume_used_inode_percent = int(item['value_inodes'])
except:
volume_used_inode_percent = -1
pvcs_in_kubernetes[volume_description]['volume_used_inode_percent'] = volume_used_inode_percent

# Precursor check to ensure we have info for this pvc in kubernetes object
if volume_description not in pvcs_in_kubernetes:
print("ERROR: The volume {} was not found in Kubernetes but had metrics in Prometheus. This may be an old volume, was just deleted, or some random jitter is occurring. If this continues to occur, please report an bug. You might also be using an older version of Prometheus, please make sure you're using v2.30.0 or newer before reporting a bug for this.".format(volume_description))
continue

if VERBOSE:
print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
print(" VERBOSE DETAILS:")
print("-------------------------------------------------------------------------------------------------------------")
print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
print("-------------------------------------------------------------------------------------------------------------")
print("Volume {} has {}% disk space used of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
if volume_used_inode_percent > -1:
print("Volume {} has {}% inodes used".format(volume_description,volume_used_inode_percent))

# Check if we are NOT in an alert condition
if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent'] and volume_used_inode_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
PROMETHEUS_METRICS['num_pvcs_below_threshold'].inc()
cache.unset(volume_description)
if VERBOSE:
print(" and is not above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
print(" and is not above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
if volume_used_inode_percent > -1:
print(" and is not above {}% inodes used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
if VERBOSE:
print("=============================================================================================================")
continue
else:
PROMETHEUS_METRICS['num_pvcs_above_threshold'].inc()
@@ -115,22 +127,30 @@
cache.set(volume_description, cache.get(volume_description) + 1)
else:
cache.set(volume_description, 1)

# In case we aren't verbose and didn't print this above, now that we're in alert we will print this
if not VERBOSE:
print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
# Print the alert status
print(" BECAUSE it is above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
print("Volume {} is {}% inode in-use".format(volume_description,volume_used_inode_percent))

# Print the alert status and reason
if volume_used_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
print(" BECAUSE it has space used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
elif volume_used_inode_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
print(" BECAUSE it has inodes used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
print(" ALERT has been for {} period(s) which needs to at least {} period(s) to scale".format(cache.get(volume_description), pvcs_in_kubernetes[volume_description]['scale_after_intervals']))

# Check if we are NOT in a possible scale condition
if cache.get(volume_description) < pvcs_in_kubernetes[volume_description]['scale_after_intervals']:
print(" BUT need to wait for {} intervals in alert before considering to scale".format( pvcs_in_kubernetes[volume_description]['scale_after_intervals'] ))
print(" FYI this has desired_size {} and current size {}".format( convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_spec_bytes']), convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes'])))
print("=============================================================================================================")
continue

# If we are in a possible scale condition, check if we recently scaled it and handle accordingly
if pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time'] >= int(time.mktime(time.gmtime())):
print(" BUT need to wait {} seconds to scale since the last scale time {} seconds ago".format( abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time']) - int(time.mktime(time.gmtime())), abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] - int(time.mktime(time.gmtime()))) ))
print("=============================================================================================================")
continue

# If we reach this far then we will be scaling the disk, all preconditions were passed from above
@@ -155,7 +175,7 @@
print(" Error/Exception while trying to determine what to resize to, volume causing failure:")
print("-------------------------------------------------------------------------------------------------------------")
print(pvcs_in_kubernetes[volume_description])
print("-------------------------------------------------------------------------------------------------------------")
print("=============================================================================================================")
continue

# If our resize bytes is less than our original size (because the user set the max-bytes to something too low)
@@ -169,33 +189,37 @@
print("-------------------------------------------------------------------------------------------------------------")
print(" Volume causing failure:")
print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
print("-------------------------------------------------------------------------------------------------------------")
print("=============================================================================================================")
continue

# Check if we are already at the max volume size (either globally, or this-volume specific)
if resize_to_bytes == pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']:
print(" SKIPPING scaling this because we are at the maximum size of {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['scale_up_max_size'])))
print("=============================================================================================================")
continue

# Check if we set on this PV we want to ignore the volume autoscaler
if pvcs_in_kubernetes[volume_description]['ignore']:
print(" IGNORING scaling this because the ignore annotation was set to true")
print("=============================================================================================================")
continue

# Lets debounce this incase we did this resize last interval(s)
if cache.get(f"{volume_description}-has-been-resized"):
print(" DEBOUNCING and skipping this scaling, we resized within recent intervals")
print("=============================================================================================================")
continue

# Check if we are DRY-RUN-ing and won't do anything
if DRY_RUN:
print(" DRY RUN was set, but we would have resized this disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
print("=============================================================================================================")
continue

# If we aren't dry-run, lets resize
PROMETHEUS_METRICS['resize_attempted'].inc()
print(" RESIZING disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk or inode space over the last `{} seconds`".format(
volume_description,
pvcs_in_kubernetes[volume_description]['scale_up_percent'],
convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
Expand Down Expand Up @@ -241,6 +265,9 @@
print(item)
traceback.print_exc()

if VERBOSE:
print("=============================================================================================================")

# Wait until our next interval
time.sleep(MAIN_LOOP_TIME)
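
The heart of the main.py change is that the alert condition now trips on either metric: a volume is considered for scaling when disk usage or inode usage crosses the same scale_above_percent threshold, with inode usage pinned to -1 (never above threshold) when Prometheus has no inode data. A minimal sketch of that decision in isolation (not part of the commit); the real loop additionally applies the interval, cooldown, and debounce checks shown above:

def is_above_threshold(volume_used_percent: int,
                       volume_used_inode_percent: int,
                       scale_above_percent: int) -> bool:
    # -1 means "no inode data", which can never cross the threshold,
    # so disk usage alone decides in that case
    return (volume_used_percent >= scale_above_percent
            or volume_used_inode_percent >= scale_above_percent)

# Hypothetical values for illustration
print(is_above_threshold(47, 93, 80))   # True: inodes nearly full
print(is_above_threshold(47, -1, 80))   # False: below threshold on disk, no inode data
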

