From fce880da95ba622a352e23c5b8340287eb5eb8c8 Mon Sep 17 00:00:00 2001
From: Farley
Date: Tue, 17 Oct 2023 22:06:15 +1300
Subject: [PATCH] Implement inode-full detection support, closes #18

---
 Makefile                                | 45 +++++++++++++++++++++++++
 examples/simple-pod-with-pvc-inode.yaml | 39 ++++++++++++++++++++
 helpers.py                              | 39 +++++++++++++++++++----
 main.py                                 | 45 ++++++++++++++++++++------
 4 files changed, 156 insertions(+), 12 deletions(-)
 create mode 100644 Makefile
 create mode 100644 examples/simple-pod-with-pvc-inode.yaml

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e57b245
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,45 @@
+.PHONY: deps run run-hot start lint test-local help
+.DEFAULT_GOAL := help
+
+SHELL = bash
+
+# Install dependencies
+deps:
+	pip3 install -r requirements.txt
+
+# Default logic for "make run" or "make start".  This is dry-run'd to prevent it from doing anything destructive while developing
+run: deps
+	@echo -e "\n----- Starting service locally (dry-run) -----"
+	# NOTE: Put secrets and local env overrides into this file to keep them from being committed
+	touch unused-local-envs.sh
+	source unused-local-envs.sh && \
+	DRY_RUN=true \
+	VERBOSE=true \
+	python3 main.py
+
+# Warning: this runs it "hot", with no dry-run in place
+run-hot: deps
+	@echo -e "\n----- Starting service locally (NOT dry-run) -----"
+	# NOTE: Put secrets and local env overrides into this file to keep them from being committed
+	touch unused-local-envs.sh
+	source unused-local-envs.sh && \
+	python3 main.py
+
+# Alternate for "run"
+start: run
+
+# Lint our code
+lint: deps
+	black .
+
+test-local:
+	@echo -e "TODO - Add tests"
+
+help:
+	@echo -e "Makefile options possible\n------------------------------"
+	@echo -e "make deps        # Install dependencies"
+	@echo -e "make run         # Run service locally (dry-run)"
+	@echo -e "make run-hot     # Run service locally (NOT dry-run)"
+	@echo -e "make start       # (alternate) Run service locally"
+	@echo -e "make lint        # Lint our code"
+	@echo -e "make test-local  # Run local tests"

diff --git a/examples/simple-pod-with-pvc-inode.yaml b/examples/simple-pod-with-pvc-inode.yaml
new file mode 100644
index 0000000..a05e3b6
--- /dev/null
+++ b/examples/simple-pod-with-pvc-inode.yaml
@@ -0,0 +1,39 @@
+# The example below creates a PVC using the default StorageClass, which must
+# have allowVolumeExpansion set to true. When the pod starts it continuously
+# creates empty files, exhausting the PVC's inodes (not its disk space). If the
+# Volume Autoscaler is installed, it should expand the volume automatically.
+#
+# Simply run: kubectl apply -f examples/simple-pod-with-pvc-inode.yaml
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: test-claim1
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 3G
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: test-claim1
+spec:
+  containers:
+  - name: write
+    image: alpine:latest
+    command: ["/bin/sh"]
+    args: ["-c", "cd /mnt/pv; i=0; while true; do touch \"file_$((i++))\"; done"]
+    volumeMounts:
+    - mountPath: "/mnt/pv"
+      name: test-volume
+    stdin: true
+    stdinOnce: true
+    tty: true
+  volumes:
+  - name: test-volume
+    persistentVolumeClaim:
+      claimName: test-claim1
+  restartPolicy: Never
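The example pod exhausts inodes rather than bytes: each touch consumes a filesystem inode while writing no data, so disk usage stays near zero as inode usage climbs. The percentage this patch alerts on can be reproduced locally with os.statvfs (a minimal sketch, not part of the patch; the mount path is the example's /mnt/pv, and the arithmetic mirrors the PromQL added below):

    import math
    import os

    def inode_used_percent(path: str) -> int:
        """Percentage of inodes consumed on the filesystem backing `path`,
        rounded up, i.e. ceil((1 - inodes_free / inodes_total) * 100)."""
        st = os.statvfs(path)
        if st.f_files == 0:  # some filesystems (e.g. overlayfs) report no inode counts
            return 0
        return math.ceil((1 - st.f_ffree / st.f_files) * 100)

    print(inode_used_percent("/mnt/pv"))  # e.g. 97 once the pod has run a while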
diff --git a/helpers.py b/helpers.py
index e67ee22..5bd450e 100644
--- a/helpers.py
+++ b/helpers.py
@@ -37,7 +37,7 @@ def detectPrometheusURL():
 DRY_RUN = True if getenv('DRY_RUN', "false").lower() == "true" else False          # If we want to dry-run this
 PROMETHEUS_LABEL_MATCH = getenv('PROMETHEUS_LABEL_MATCH') or ''                    # A PromQL label query to restrict volumes for this to see and scale, without braces.  eg: 'namespace="dev"'
 HTTP_TIMEOUT = int(getenv('HTTP_TIMEOUT', "15")) or 15                             # Allows to set the timeout for calls to Prometheus and Kubernetes. This might be needed if your Prometheus or Kubernetes is over a remote WAN link with high latency and/or is heavily loaded
-PROMETHEUS_VERSION = "Unknown"                                                     # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
+PROMETHEUS_VERSION = "0.0.0"                                                       # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user.  A parseable default so version comparisons work before detection
 VERBOSE = True if getenv('VERBOSE', "false").lower() == "true" else False          # If we want to verbose mode
 VICTORIAMETRICS_COMPAT = True if getenv('VICTORIAMETRICS_MODE', "false").lower() == "true" else False  # Whether to skip the prometheus check and assume victoriametrics
 SCOPE_ORGID_AUTH_HEADER = getenv('SCOPE_ORGID_AUTH_HEADER') or ''                  # If we want to use Mimir or AgentMode which requires an orgid header. See: https://grafana.com/docs/mimir/latest/references/http-api/#authentication
@@ -489,7 +489,38 @@ def fetch_pvcs_from_prometheus(url, label_match=PROMETHEUS_LABEL_MATCH):
         print("Prometheus Error: {}".format(response_object['error']))
         exit(-1)
 
-    return response_object['data']['result']
+    # Also try to fetch the inode usage percentage per volume, merging it into the results above
+    try:
+        if version.parse(PROMETHEUS_VERSION) >= version.parse("2.30.0"):
+            inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100) and present_over_time(kubelet_volume_stats_inodes_free{{ {} }}[1h])".format(label_match,label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
+        else:
+            inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100)".format(label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
+        inodes_response_object = inodes_response.json()
+
+        # Prepare values to merge/inject into our disk-usage response_object above, keyed on namespace_pvcname
+        inject_values = {}
+        for item in inodes_response_object['data']['result']:
+            ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
+            inject_values[ourkey] = item['value'][1]
+
+        # Inject/merge them into the disk-usage results
+        output_response_object = []
+        for item in response_object['data']['result']:
+            try:
+                ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
+                if ourkey in inject_values:
+                    item['value_inodes'] = inject_values[ourkey]
+            except Exception as e:
+                print("Caught exception while trying to inject inode usage, please report this...")
+                print(e)
+            output_response_object.append(item)
+
+        return output_response_object
+    except Exception as e:
+        print("Caught exception while trying to fetch inode usage, please report this...")
+        print(e)
+        # Fall back to the disk-usage-only results so a missing inode metric never breaks scaling
+        return response_object['data']['result']
 
 
 # Describe an specific PVC
@@ -550,13 +581,13 @@ def send_kubernetes_event(namespace, name, reason, message, type="Normal"):
 # Print a sexy human readable dict for volume
 def print_human_readable_volume_dict(input_dict):
     for key in input_dict:
-        print("  {}: {}".format(key.rjust(24), input_dict[key]), end='')
+        print("  {}: {}".format(key.rjust(25), input_dict[key]), end='')
         if key in ['volume_size_spec','volume_size_spec_bytes','volume_size_status','volume_size_status_bytes','scale_up_min_increment','scale_up_max_increment','scale_up_max_size'] and is_integer_or_float(input_dict[key]):
             print(" ({})".format(convert_bytes_to_storage(input_dict[key])), end='')
         if key in ['scale_cooldown_time']:
             print(" ({})".format(time.strftime('%H:%M:%S', time.gmtime(input_dict[key]))), end='')
         if key in ['last_resized_at']:
             print(" ({})".format(time.strftime('%Y-%m-%d %H:%M:%S %Z %z', time.localtime(input_dict[key]))), end='')
-        if key in ['scale_up_percent','scale_above_percent']:
+        if key in ['scale_up_percent','scale_above_percent','volume_used_percent','volume_used_inode_percent']:
             print("%", end='')
         print("") # Newline
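The merge above keys both query results by namespace_pvcname and grafts the inode percentage onto the disk-usage result as value_inodes, so downstream code receives one record per PVC. A self-contained sketch of that pattern with hypothetical sample data (the real values come from the two Prometheus queries):

    # Disk-usage results (value) and inode-usage results (to become value_inodes)
    disk_results = [{"metric": {"namespace": "dev", "persistentvolumeclaim": "test-claim1"},
                     "value": [1697536000, "42"]}]
    inode_results = [{"metric": {"namespace": "dev", "persistentvolumeclaim": "test-claim1"},
                      "value": [1697536000, "97"]}]

    inject_values = {
        "{}_{}".format(i["metric"]["namespace"], i["metric"]["persistentvolumeclaim"]): i["value"][1]
        for i in inode_results
    }
    for item in disk_results:
        key = "{}_{}".format(item["metric"]["namespace"], item["metric"]["persistentvolumeclaim"])
        if key in inject_values:
            item["value_inodes"] = inject_values[key]

    print(disk_results[0]["value"][1], disk_results[0]["value_inodes"])  # 42 97

A PVC that never reported inode metrics simply lacks the value_inodes key, which main.py below treats as -1 (unknown).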
diff --git a/main.py b/main.py
index 0216ede..a18ccbb 100755
--- a/main.py
+++ b/main.py
@@ -94,18 +94,31 @@
             continue
 
+        # Store the disk usage percent, and the inode usage percent (or -1 if unavailable), on this PVC's record
+        pvcs_in_kubernetes[volume_description]['volume_used_percent'] = volume_used_percent
+        try:
+            volume_used_inode_percent = int(item['value_inodes'])
+        except:
+            volume_used_inode_percent = -1
+        pvcs_in_kubernetes[volume_description]['volume_used_inode_percent'] = volume_used_inode_percent
+
         if VERBOSE:
-            print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
             print("  VERBOSE DETAILS:")
             print("-------------------------------------------------------------------------------------------------------------")
             print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
             print("-------------------------------------------------------------------------------------------------------------")
+            print("Volume {} has {}% disk space used of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
+            if volume_used_inode_percent > -1:
+                print("Volume {} has {}% inodes used".format(volume_description,volume_used_inode_percent))
 
         # Check if we are NOT in an alert condition
-        if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+        if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent'] and volume_used_inode_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
             PROMETHEUS_METRICS['num_pvcs_below_threshold'].inc()
             cache.unset(volume_description)
             if VERBOSE:
-                print("  and is not above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+                print("  and is not above {}% disk space used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+                if volume_used_inode_percent > -1:
+                    print("  and is not above {}% inodes used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+                print("=============================================================================================================")
             continue
         else:
             PROMETHEUS_METRICS['num_pvcs_above_threshold'].inc()
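The new alert condition fires when either percentage crosses the same scale_above_percent threshold, with -1 standing in for "inode metrics unavailable" so it can never trigger on its own. Distilled into a standalone predicate (a hypothetical helper, not in the patch):

    def should_alert(disk_pct: int, inode_pct: int, threshold: int) -> bool:
        """True when disk OR inode usage is at/above threshold.
        inode_pct == -1 means inode metrics were unavailable."""
        return disk_pct >= threshold or inode_pct >= threshold

    assert should_alert(85, -1, 80)      # disk full enough, inodes unknown
    assert should_alert(10, 97, 80)      # an inode-full volume now alerts too
    assert not should_alert(10, -1, 80)  # healthy volume, no inode data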
@@ -115,22 +128,31 @@
             cache.set(volume_description, cache.get(volume_description) + 1)
         else:
             cache.set(volume_description, 1)
 
+        # In case we aren't verbose, and didn't print this above, now that we're in alert we will print this
         if not VERBOSE:
             print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
-        # Print the alert status
-        print("  BECAUSE it is above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+            if volume_used_inode_percent > -1:
+                print("Volume {} has {}% inodes used".format(volume_description,volume_used_inode_percent))
+
+        # Print the alert status and reason
+        if volume_used_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+            print("  BECAUSE it has disk space used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+        elif volume_used_inode_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+            print("  BECAUSE it has inodes used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
         print("  ALERT has been for {} period(s) which needs to at least {} period(s) to scale".format( cache.get(volume_description), pvcs_in_kubernetes[volume_description]['scale_after_intervals'] ))
 
         # Check if we are NOT in a possible scale condition
         if cache.get(volume_description) < pvcs_in_kubernetes[volume_description]['scale_after_intervals']:
             print("  BUT need to wait for {} intervals in alert before considering to scale".format( pvcs_in_kubernetes[volume_description]['scale_after_intervals'] ))
             print("  FYI this has desired_size {} and current size {}".format( convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_spec_bytes']), convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes'])))
+            print("=============================================================================================================")
             continue
 
         # If we are in a possible scale condition, check if we recently scaled it and handle accordingly
         if pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time'] >= int(time.mktime(time.gmtime())):
             print("  BUT need to wait {} seconds to scale since the last scale time {} seconds ago".format( abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time']) - int(time.mktime(time.gmtime())), abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] - int(time.mktime(time.gmtime()))) ))
+            print("=============================================================================================================")
             continue
 
         # If we reach this far then we will be scaling the disk, all preconditions were passed from above
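The cooldown check compares last_resized_at plus scale_cooldown_time against the current epoch; the printed wait time is simply the remainder of that window. As a sketch (assuming epoch-second integers, obtained the same way the code above does):

    import time

    def seconds_until_resize_allowed(last_resized_at: int, cooldown: int) -> int:
        """Seconds left in the cooldown window; 0 when a resize is allowed again.
        Mirrors the last_resized_at + scale_cooldown_time >= now check above."""
        now = int(time.mktime(time.gmtime()))
        return max(0, (last_resized_at + cooldown) - now)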
@@ -155,7 +177,7 @@
                 print("  Error/Exception while trying to determine what to resize to, volume causing failure:")
                 print("-------------------------------------------------------------------------------------------------------------")
                 print(pvcs_in_kubernetes[volume_description])
-                print("-------------------------------------------------------------------------------------------------------------")
+                print("=============================================================================================================")
                 continue
 
         # If our resize bytes is less than our original size (because the user set the max-bytes to something too low)
@@ -169,33 +191,37 @@
             print("-------------------------------------------------------------------------------------------------------------")
             print("  Volume causing failure:")
             print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
-            print("-------------------------------------------------------------------------------------------------------------")
+            print("=============================================================================================================")
             continue
 
         # Check if we are already at the max volume size (either globally, or this-volume specific)
         if resize_to_bytes == pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']:
             print("  SKIPPING scaling this because we are at the maximum size of {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['scale_up_max_size'])))
+            print("=============================================================================================================")
             continue
 
         # Check if we set on this PV we want to ignore the volume autoscaler
         if pvcs_in_kubernetes[volume_description]['ignore']:
             print("  IGNORING scaling this because the ignore annotation was set to true")
+            print("=============================================================================================================")
             continue
 
         # Lets debounce this incase we did this resize last interval(s)
         if cache.get(f"{volume_description}-has-been-resized"):
             print("  DEBOUNCING and skipping this scaling, we resized within recent intervals")
+            print("=============================================================================================================")
             continue
 
         # Check if we are DRY-RUN-ing and won't do anything
         if DRY_RUN:
             print("  DRY RUN was set, but we would have resized this disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
+            print("=============================================================================================================")
             continue
 
         # If we aren't dry-run, lets resize
         PROMETHEUS_METRICS['resize_attempted'].inc()
         print("  RESIZING disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
-        status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
+        status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk or inode space over the last `{} seconds`".format(
             volume_description,
             pvcs_in_kubernetes[volume_description]['scale_up_percent'],
             convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
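The resize_to_bytes target checked above is computed earlier, outside this hunk; conceptually it is a percent-based step clamped by the annotations shown in the printed dict. A simplified sketch of that idea (not the project's exact sizing code; it assumes all four parameters are set):

    def sketch_resize_target(current_bytes: int, scale_up_percent: int,
                             scale_up_min_increment: int, scale_up_max_increment: int,
                             scale_up_max_size: int) -> int:
        """Grow by scale_up_percent, clamped to the min/max increment,
        and never beyond scale_up_max_size."""
        increment = current_bytes * scale_up_percent // 100
        increment = max(increment, scale_up_min_increment)
        increment = min(increment, scale_up_max_increment)
        return min(current_bytes + increment, scale_up_max_size)

The "SKIPPING ... at the maximum size" check above relies on exactly this clamping: once the current size equals scale_up_max_size, the computed target stops growing.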
@@ -241,6 +267,9 @@
             print(item)
             traceback.print_exc()
 
+        if VERBOSE:
+            print("=============================================================================================================")
+
     # Wait until our next interval
     time.sleep(MAIN_LOOP_TIME)
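To verify the feature end to end, the same inode expression the patch issues can be run against Prometheus by hand (a sketch; assumes Prometheus is port-forwarded to localhost:9090):

    import requests

    PROMETHEUS_URL = "http://localhost:9090"
    QUERY = 'ceil((1 - kubelet_volume_stats_inodes_free / kubelet_volume_stats_inodes)*100)'

    result = requests.get(PROMETHEUS_URL + "/api/v1/query",
                          params={"query": QUERY}, timeout=15).json()
    for item in result["data"]["result"]:
        print(item["metric"]["namespace"],
              item["metric"]["persistentvolumeclaim"],
              item["value"][1] + "%")

Run the example pod from examples/simple-pod-with-pvc-inode.yaml and the reported percentage should climb toward 100 within a few minutes, at which point a DRY_RUN=true instance of the autoscaler will log the resize it would have made.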