From 3fbc93ee5241c2460aa3a5cbbc5e9663b676e0d4 Mon Sep 17 00:00:00 2001
From: Daxin Wang <46807570+dxsup@users.noreply.github.com>
Date: Thu, 1 Dec 2022 19:20:21 +0800
Subject: [PATCH] Add a new clustering method "blank" (#372)

Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn>
---
 CHANGELOG.md                                       |  2 +-
 collector/docker/kindling-collector-config.yml     |  3 ++-
 .../analyzer/network/protocol/http/http_parser.go  | 10 +---------
 collector/pkg/urlclustering/blank.go               | 14 ++++++++++++++
 collector/pkg/urlclustering/factory.go             | 14 ++++++++++++++
 deploy/agent/kindling-collector-config.yml         |  3 ++-
 6 files changed, 34 insertions(+), 12 deletions(-)
 create mode 100644 collector/pkg/urlclustering/blank.go
 create mode 100644 collector/pkg/urlclustering/factory.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0eee52d26..6aae0b8c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,7 @@
 ### Enhancements
 - Add payload for all protocols.([#375](https://github.com/KindlingProject/kindling/pull/375))
 - 
-- 
+- Add a new clustering method "blank" that is used to reduce the cardinality of metrics as much as possible. ([#372](https://github.com/KindlingProject/kindling/pull/372))
 
 ### Bug fixes
 - 
diff --git a/collector/docker/kindling-collector-config.yml b/collector/docker/kindling-collector-config.yml
index 42eb4e8cd..d28d55648 100644
--- a/collector/docker/kindling-collector-config.yml
+++ b/collector/docker/kindling-collector-config.yml
@@ -56,10 +56,11 @@ analyzers:
     protocol_parser: [ http, mysql, dns, redis, kafka, rocketmq ]
     # Which URL clustering method should be used to shorten the URL of HTTP request.
     # This is useful for decrease the cardinality of URLs.
-    # Valid values: ["noparam", "alphabet"]
+    # Valid values: ["noparam", "alphabet", "blank"]
     # - noparam: Only trim the trailing parameters behind the character '?'
     # - alphabet: Trim the trailing parameters and Convert the segments
     #             containing non-alphabetical characters to star(*)
+    # - blank: Replace every endpoint with an empty string. This reduces the cardinality as much as possible.
     url_clustering_method: alphabet
     # If the destination port of data is one of the followings, the protocol of such network request
     # is set to the corresponding one. Note the program will try to identify the protocol automatically
diff --git a/collector/pkg/component/analyzer/network/protocol/http/http_parser.go b/collector/pkg/component/analyzer/network/protocol/http/http_parser.go
index f1e614a37..ff8af4c58 100644
--- a/collector/pkg/component/analyzer/network/protocol/http/http_parser.go
+++ b/collector/pkg/component/analyzer/network/protocol/http/http_parser.go
@@ -8,15 +8,7 @@ import (
 )
 
 func NewHttpParser(urlClusteringMethod string) *protocol.ProtocolParser {
-	var method urlclustering.ClusteringMethod
-	switch urlClusteringMethod {
-	case "alphabet":
-		method = urlclustering.NewAlphabeticalClusteringMethod()
-	case "noparam":
-		method = urlclustering.NewNoParamClusteringMethod()
-	default:
-		method = urlclustering.NewAlphabeticalClusteringMethod()
-	}
+	method := urlclustering.NewMethod(urlClusteringMethod)
 	requestParser := protocol.CreatePkgParser(fastfailHttpRequest(), parseHttpRequest(method))
 	responseParser := protocol.CreatePkgParser(fastfailHttpResponse(), parseHttpResponse())
 
diff --git a/collector/pkg/urlclustering/blank.go b/collector/pkg/urlclustering/blank.go
new file mode 100644
index 000000000..d8a2b4685
--- /dev/null
+++ b/collector/pkg/urlclustering/blank.go
@@ -0,0 +1,14 @@
+package urlclustering
+
+// BlankClusteringMethod removes the endpoint and returns an empty string.
+// This method is used to reduce the cardinality as much as possible.
+type BlankClusteringMethod struct {
+}
+
+func NewBlankClusteringMethod() ClusteringMethod {
+	return &BlankClusteringMethod{} // the struct has no fields, so there is nothing to configure
+}
+
+func (m *BlankClusteringMethod) Clustering(_ string) string {
+	return "" // the input endpoint is ignored; every call yields the empty string
+}
diff --git a/collector/pkg/urlclustering/factory.go b/collector/pkg/urlclustering/factory.go
new file mode 100644
index 000000000..ead91dbc5
--- /dev/null
+++ b/collector/pkg/urlclustering/factory.go
@@ -0,0 +1,14 @@
+package urlclustering
+
+func NewMethod(urlClusteringMethod string) ClusteringMethod {
+	switch urlClusteringMethod {
+	case "alphabet":
+		return NewAlphabeticalClusteringMethod()
+	case "noparam":
+		return NewNoParamClusteringMethod()
+	case "blank":
+		return NewBlankClusteringMethod()
+	default: // any unrecognized name falls back to the alphabetical method
+		return NewAlphabeticalClusteringMethod()
+	}
+}
diff --git a/deploy/agent/kindling-collector-config.yml b/deploy/agent/kindling-collector-config.yml
index 6247c67c6..4ea2e82ac 100644
--- a/deploy/agent/kindling-collector-config.yml
+++ b/deploy/agent/kindling-collector-config.yml
@@ -56,10 +56,11 @@ analyzers:
     protocol_parser: [ http, mysql, dns, redis, kafka, rocketmq ]
     # Which URL clustering method should be used to shorten the URL of HTTP request.
     # This is useful for decrease the cardinality of URLs.
-    # Valid values: ["noparam", "alphabet"]
+    # Valid values: ["noparam", "alphabet", "blank"]
     # - noparam: Only trim the trailing parameters behind the character '?'
     # - alphabet: Trim the trailing parameters and Convert the segments
     #             containing non-alphabetical characters to star(*)
+    # - blank: Replace every endpoint with an empty string. This reduces the cardinality as much as possible.
     url_clustering_method: alphabet
     # If the destination port of data is one of the followings, the protocol of such network request
     # is set to the corresponding one. Note the program will try to identify the protocol automatically