From 338507248fc6975ec3db642306b6476c1decb073 Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 10:31:34 -0400 Subject: [PATCH 1/3] Optimize postgresql ingest pipeline The postgresql ingest pipeline was not performing so well. This PR uses the following rules to improve the situation. - Anchor the regular expression at the beginning of the string. - Merge the multiple statements into a single RE - Do not use back reference for user/host delimiter. Fixes: #7201 --- filebeat/module/postgresql/log/ingest/pipeline.json | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/filebeat/module/postgresql/log/ingest/pipeline.json b/filebeat/module/postgresql/log/ingest/pipeline.json index 1d1904b1e384..1ba0fa62e66d 100644 --- a/filebeat/module/postgresql/log/ingest/pipeline.json +++ b/filebeat/module/postgresql/log/ingest/pipeline.json @@ -6,16 +6,10 @@ "field": "message", "ignore_missing": true, "patterns": [ - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database} %{WORD:postgresql.log.level}: duration: %{NUMBER:postgresql.log.duration} ms statement: %{MULTILINEQUERY:postgresql.log.query}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] \\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\] %{WORD:postgresql.log.level}: duration: %{NUMBER:postgresql.log.duration} ms statement: %{MULTILINEQUERY:postgresql.log.query}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database} %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] 
\\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\] %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}", - "%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] %{WORD:postgresql.log.level}: ?%{GREEDYDATA:postgresql.log.message}" + "^%{LOCALDATETIME:postgresql.log.timestamp} %{WORD:postgresql.log.timezone} \\[%{NUMBER:postgresql.log.thread_id}\\] ((\\[%{USERNAME:postgresql.log.user}\\]@\\[%{POSTGRESQL_DB_NAME:postgresql.log.database}\\]|%{USERNAME:postgresql.log.user}@%{POSTGRESQL_DB_NAME:postgresql.log.database}) )?%{WORD:postgresql.log.level}: (duration: %{NUMBER:postgresql.log.duration} ms statement: %{GREEDYDATA:postgresql.log.query}|%{GREEDYDATA:postgresql.log.message})" ], "pattern_definitions": { "LOCALDATETIME": "[-0-9]+ %{TIME}", - "GREEDYDATA": ".*", - "MULTILINEQUERY" : "(.|\n|\t)*?;$", "POSTGRESQL_DB_NAME": "[a-zA-Z0-9_]+[a-zA-Z0-9_\\$]*" } } From d8d80e56d5805eb0559ce5bb225c63f49b9e1bec Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 15:03:41 -0400 Subject: [PATCH 2/3] multiline greedy --- filebeat/module/postgresql/log/ingest/pipeline.json | 1 + filebeat/tests/system/test_modules.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/filebeat/module/postgresql/log/ingest/pipeline.json b/filebeat/module/postgresql/log/ingest/pipeline.json index 1ba0fa62e66d..c9c33c0bb7e5 100644 --- a/filebeat/module/postgresql/log/ingest/pipeline.json +++ b/filebeat/module/postgresql/log/ingest/pipeline.json @@ -10,6 +10,7 @@ ], "pattern_definitions": { "LOCALDATETIME": "[-0-9]+ %{TIME}", + "GREEDYDATA": "(.|\n|\t)*", "POSTGRESQL_DB_NAME": "[a-zA-Z0-9_]+[a-zA-Z0-9_\\$]*" } } diff --git a/filebeat/tests/system/test_modules.py b/filebeat/tests/system/test_modules.py index 090cf28077c1..917f5809fff6 100644 --- a/filebeat/tests/system/test_modules.py +++ b/filebeat/tests/system/test_modules.py @@ -113,7 +113,7 @@ 
def _test_expected_events(self, module, test_file, res, objects): break assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format( - ev["_source"][module], objects) + pretty_json(ev["_source"][module]), pretty_json(objects)) def run_on_file(self, module, fileset, test_file, cfgfile): print("Testing {}/{} on {}".format(module, fileset, test_file)) @@ -321,3 +321,6 @@ def _run_ml_test(self, setup_flag, modules_flag): max_timeout=30) beat.kill() + +def pretty_json(obj): + return json.dumps(obj, indent=2, separators=(',',': ')) From ccc274fa0ffa02eb68fa8536dde68f88dd359a24 Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Tue, 5 Jun 2018 16:31:29 -0400 Subject: [PATCH 3/3] pep8 --- filebeat/tests/system/test_modules.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/filebeat/tests/system/test_modules.py b/filebeat/tests/system/test_modules.py index 917f5809fff6..37b199715a43 100644 --- a/filebeat/tests/system/test_modules.py +++ b/filebeat/tests/system/test_modules.py @@ -130,7 +130,8 @@ def run_on_file(self, module, fileset, test_file, cfgfile): "-c", cfgfile, "-modules={}".format(module), "-M", "{module}.*.enabled=false".format(module=module), - "-M", "{module}.{fileset}.enabled=true".format(module=module, fileset=fileset), + "-M", "{module}.{fileset}.enabled=true".format( + module=module, fileset=fileset), "-M", "{module}.{fileset}.var.paths=[{test_file}]".format( module=module, fileset=fileset, test_file=test_file), "-M", "*.*.input.close_eof=true", @@ -157,7 +158,8 @@ def run_on_file(self, module, fileset, test_file, cfgfile): assert obj["fileset"]["module"] == module, "expected fileset.module={} but got {}".format( module, obj["fileset"]["module"]) - assert "error" not in obj, "not error expected but got: {}".format(obj) + assert "error" not in obj, "not error expected but got: {}".format( + obj) if (module == "auditd" and fileset == "log") \ or (module == "osquery" and fileset == 
"result"): @@ -248,13 +250,16 @@ def _run_ml_test(self, setup_flag, modules_flag): # Clean any previous state for df in self.es.transport.perform_request("GET", "/_xpack/ml/datafeeds/")["datafeeds"]: if df["datafeed_id"] == 'filebeat-nginx-access-response_code': - self.es.transport.perform_request("DELETE", "/_xpack/ml/datafeeds/" + df["datafeed_id"]) + self.es.transport.perform_request( + "DELETE", "/_xpack/ml/datafeeds/" + df["datafeed_id"]) for df in self.es.transport.perform_request("GET", "/_xpack/ml/anomaly_detectors/")["jobs"]: if df["job_id"] == 'datafeed-filebeat-nginx-access-response_code': - self.es.transport.perform_request("DELETE", "/_xpack/ml/anomaly_detectors/" + df["job_id"]) + self.es.transport.perform_request( + "DELETE", "/_xpack/ml/anomaly_detectors/" + df["job_id"]) - shutil.rmtree(os.path.join(self.working_dir, "modules.d"), ignore_errors=True) + shutil.rmtree(os.path.join(self.working_dir, + "modules.d"), ignore_errors=True) # generate a minimal configuration cfgfile = os.path.join(self.working_dir, "filebeat.yml") @@ -322,5 +327,6 @@ def _run_ml_test(self, setup_flag, modules_flag): beat.kill() + def pretty_json(obj): - return json.dumps(obj, indent=2, separators=(',',': ')) + return json.dumps(obj, indent=2, separators=(',', ': '))