From 338507248fc6975ec3db642306b6476c1decb073 Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 10:31:34 -0400 Subject: [PATCH 1/3] Optimize postgresql ingest pipeline The postgresql ingest pipeline was not performing so well. This PR uses the following rules to improve the situation. - Anchor the regular expression at the beginning of the string. - Merge the multiple statements into a single RE - Do not use back reference for user/host delimiter. Fixes: #7201 --- filebeat/module/postgresql/log/ingest/pipeline.json | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/filebeat/module/postgresql/log/ingest/pipeline.json b/filebeat/module/postgresql/log/ingest/pipeline.json index 1d1904b1e384..1ba0fa62e66d 100644 --- a/filebeat/module/postgresql/log/ingest/pipeline.json +++ b/filebeat/module/postgresql/log/ingest/pipeline.json @@ -6,16 +6,10 @@ "field": "message", "ignore_missing": true, "patterns": [ - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database} %{WORD:postgresql.log.level}: duration: %{NUMBER:postgresql.log.duration} ms statement: %{MULTILINEQUERY:postgresql.log.query}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] \\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\] %{WORD:postgresql.log.level}: duration: %{NUMBER:postgresql.log.duration} ms statement: %{MULTILINEQUERY:postgresql.log.query}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database} %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] 
\\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\] %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}" + "^%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] ((\\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\]|%{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database}) )?%{WORD:postgresql.log.level}: (duration: %{NUMBER:postgresql.log.duration} ms statement: %{GREEDYDATA:postgresql.log.query}|%{GREEDYDATA:postgresql.log.message})" ], "pattern_definitions": { "LOCALDATETIME": "[-0-9]+ %{TIME}", - "GREEDYDATA": ".*", - "MULTILINEQUERY" : "(.|\n|\t)*?;$", "POSTGRESQL_DB_NAME": "[a-zA-Z0-9_]+[a-zA-Z0-9_\\$]*" } } From d8d80e56d5805eb0559ce5bb225c63f49b9e1bec Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 15:03:41 -0400 Subject: [PATCH 2/3] multiline greedy --- filebeat/module/postgresql/log/ingest/pipeline.json | 1 + filebeat/tests/system/test_modules.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/filebeat/module/postgresql/log/ingest/pipeline.json b/filebeat/module/postgresql/log/ingest/pipeline.json index 1ba0fa62e66d..c9c33c0bb7e5 100644 --- a/filebeat/module/postgresql/log/ingest/pipeline.json +++ b/filebeat/module/postgresql/log/ingest/pipeline.json @@ -10,6 +10,7 @@ ], "pattern_definitions": { "LOCALDATETIME": "[-0-9]+ %{TIME}", + "GREEDYDATA": "(.|\n|\t)*", "POSTGRESQL_DB_NAME": "[a-zA-Z0-9_]+[a-zA-Z0-9_\\$]*" } } diff --git a/filebeat/tests/system/test_modules.py b/filebeat/tests/system/test_modules.py index 090cf28077c1..917f5809fff6 100644 --- a/filebeat/tests/system/test_modules.py +++ b/filebeat/tests/system/test_modules.py @@ -113,7 +113,7 @@ 
def _test_expected_events(self, module, test_file, res, objects): break assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format( - ev["_source"][module], objects) + pretty_json(ev["_source"][module]), pretty_json(objects)) def run_on_file(self, module, fileset, test_file, cfgfile): print("Testing {}/{} on {}".format(module, fileset, test_file)) @@ -321,3 +321,6 @@ def _run_ml_test(self, setup_flag, modules_flag): max_timeout=30) beat.kill() + +def pretty_json(obj): + return json.dumps(obj, indent=2, separators=(',',': ')) From ccc274fa0ffa02eb68fa8536dde68f88dd359a24 Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 16:31:29 -0400 Subject: [PATCH 3/3] pep8 --- filebeat/tests/system/test_modules.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/filebeat/tests/system/test_modules.py b/filebeat/tests/system/test_modules.py index 917f5809fff6..37b199715a43 100644 --- a/filebeat/tests/system/test_modules.py +++ b/filebeat/tests/system/test_modules.py @@ -130,7 +130,8 @@ def run_on_file(self, module, fileset, test_file, cfgfile): "-c", cfgfile, "-modules={}".format(module), "-M", "{module}.*.enabled=false".format(module=module), - "-M", "{module}.{fileset}.enabled=true".format(module=module, fileset=fileset), + "-M", "{module}.{fileset}.enabled=true".format( + module=module, fileset=fileset), "-M", "{module}.{fileset}.var.paths=[{test_file}]".format( module=module, fileset=fileset, test_file=test_file), "-M", "*.*.input.close_eof=true", @@ -157,7 +158,8 @@ def run_on_file(self, module, fileset, test_file, cfgfile): assert obj["fileset"]["module"] == module, "expected fileset.module={} but got {}".format( module, obj["fileset"]["module"]) - assert "error" not in obj, "not error expected but got: {}".format(obj) + assert "error" not in obj, "not error expected but got: {}".format( + obj) if (module == "auditd" and fileset == "log") \ or (module == "osquery" and fileset == 
"result"): @@ -248,13 +250,16 @@ def _run_ml_test(self, setup_flag, modules_flag): # Clean any previous state for df in self.es.transport.perform_request("GET", "/_xpack/ml/datafeeds/")["datafeeds"]: if df["datafeed_id"] == 'filebeat-nginx-access-response_code': - self.es.transport.perform_request("DELETE", "/_xpack/ml/datafeeds/" + df["datafeed_id"]) + self.es.transport.perform_request( + "DELETE", "/_xpack/ml/datafeeds/" + df["datafeed_id"]) for df in self.es.transport.perform_request("GET", "/_xpack/ml/anomaly_detectors/")["jobs"]: if df["job_id"] == 'datafeed-filebeat-nginx-access-response_code': - self.es.transport.perform_request("DELETE", "/_xpack/ml/anomaly_detectors/" + df["job_id"]) + self.es.transport.perform_request( + "DELETE", "/_xpack/ml/anomaly_detectors/" + df["job_id"]) - shutil.rmtree(os.path.join(self.working_dir, "modules.d"), ignore_errors=True) + shutil.rmtree(os.path.join(self.working_dir, + "modules.d"), ignore_errors=True) # generate a minimal configuration cfgfile = os.path.join(self.working_dir, "filebeat.yml") @@ -322,5 +327,6 @@ def _run_ml_test(self, setup_flag, modules_flag): beat.kill() + def pretty_json(obj): - return json.dumps(obj, indent=2, separators=(',',': ')) + return json.dumps(obj, indent=2, separators=(',', ': '))