diff --git a/CHANGELOG.next.md b/CHANGELOG.next.md index 4f624151f1..0b620fffc2 100644 --- a/CHANGELOG.next.md +++ b/CHANGELOG.next.md @@ -38,6 +38,9 @@ Thanks, you're awesome :-) --> #### Added +* Introduced `--strict` flag to perform stricter schema validation when running the generator script. #937 +* Added check under `--strict` that ensures composite types in example fields are quoted. #966 + #### Improvements * Field details Jinja2 template components have been consolidated into one template #897 diff --git a/USAGE.md b/USAGE.md index 334879892e..398226e356 100644 --- a/USAGE.md +++ b/USAGE.md @@ -294,6 +294,51 @@ The `--template-settings` argument defines [index level settings](https://www.el For `template.json`, the `mappings` object is left empty: `{}`. Likewise the `properties` object remains empty in the `mapping.json` example. This will be filled in automatically by the script. +#### Strict Mode + +The `--strict` argument enables "strict mode". Strict mode performs a stricter validation step against the schema's contents. + +Basic usage: + +``` +$ python scripts/generator.py --strict +``` + +Strict mode requires the following conditions, else the script exits on an exception: + +* Short descriptions must be less than or equal to 120 characters. +* Example values containing arrays or objects must be quoted to avoid unexpected YAML interpretation when the schema files or artifacts are relied on downstream. + +The current artifacts generated and published in the ECS repo will always be created using strict mode. However, older ECS versions (pre `v1.5.0`) will cause +an exception if attempting to generate them using `--strict`. This is due to schema validation checks introduced after that version was released. + +Example: + +``` +$ python scripts/generator.py --ref v1.4.0 --strict +Loading schemas from git ref v1.4.0 +Running generator. ECS version 1.4.0 +... +ValueError: Short descriptions must be single line, and under 120 characters (current length: 134). 
+Offending field or field set: number +Short description: + Unique number allocated to the autonomous system. The autonomous system number (ASN) uniquely identifies each network on the Internet. +``` + +Removing `--strict` will display a warning message, but the script will finish its run successfully: + +``` +$ python scripts/generator.py --ref v1.4.0 +Loading schemas from git ref v1.4.0 +Running generator. ECS version 1.4.0 +/Users/ericbeahan/dev/ecs/scripts/generators/ecs_helpers.py:176: UserWarning: Short descriptions must be single line, and under 120 characters (current length: 134). +Offending field or field set: number +Short description: + Unique number allocated to the autonomous system. The autonomous system number (ASN) uniquely identifies each network on the Internet. + +This will cause an exception when running in strict mode. +``` + #### Intermediate-Only The `--intermediate-only` argument is used for debugging purposes. It only generates the ["intermediate files"](generated/ecs), `ecs_flat.yml` and `ecs_nested.yml`, without generating the rest of the artifacts. diff --git a/docs/field-details.asciidoc b/docs/field-details.asciidoc index da97a3c5f8..2f06d7194d 100644 --- a/docs/field-details.asciidoc +++ b/docs/field-details.asciidoc @@ -1211,7 +1211,7 @@ Note: this field should contain an array of values. -example: `['RD', 'RA']` +example: `["RD", "RA"]` | extended @@ -1343,7 +1343,7 @@ Note: this field should contain an array of values. -example: `['10.10.10.10', '10.10.10.11']` +example: `["10.10.10.10", "10.10.10.11"]` | extended @@ -4205,7 +4205,7 @@ Note: this field should contain an array of values. -example: `['/usr/bin/ssh', '-l', 'user', '10.0.0.16']` +example: `["/usr/bin/ssh", "-l", "user", "10.0.0.16"]` | extended @@ -4718,7 +4718,7 @@ Note: this field should contain an array of values. -example: `['Star-Lord']` +example: `["Star-Lord"]` | extended @@ -5624,7 +5624,7 @@ Note: this field should contain an array of values. 
-example: `['MII...', 'MII...']` +example: `["MII...", "MII..."]` | extended @@ -5757,7 +5757,7 @@ Note: this field should contain an array of values. -example: `['TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', '...']` +example: `["TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", "..."]` | extended @@ -5838,7 +5838,7 @@ Note: this field should contain an array of values. -example: `['MII...', 'MII...']` +example: `["MII...", "MII..."]` | extended diff --git a/generated/beats/fields.ecs.yml b/generated/beats/fields.ecs.yml index c632daaead..573abe8499 100644 --- a/generated/beats/fields.ecs.yml +++ b/generated/beats/fields.ecs.yml @@ -1013,9 +1013,7 @@ description: 'Array of 2 letter DNS header flags. Expected values are: AA, TC, RD, RA, AD, CD, DO.' - example: - - RD - - RA + example: '["RD", "RA"]' - name: id level: extended type: keyword @@ -1096,9 +1094,7 @@ formats it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for.' - example: - - 10.10.10.10 - - 10.10.10.11 + example: '["10.10.10.10", "10.10.10.11"]' - name: response_code level: extended type: keyword @@ -3229,11 +3225,7 @@ the executable. May be filtered to protect sensitive information.' - example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' - name: args_count level: extended type: long @@ -3376,11 +3368,7 @@ the executable. May be filtered to protect sensitive information.' - example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' default_field: false - name: parent.args_count level: extended @@ -3884,8 +3872,7 @@ ignore_above: 1024 description: Name, organization, or pseudonym of the author or authors who created the rule used to generate this event. 
- example: - - Star-Lord + example: '["Star-Lord"]' default_field: false - name: category level: extended @@ -4652,9 +4639,7 @@ description: Array of PEM-encoded certificates that make up the certificate chain offered by the client. This is usually mutually-exclusive of `client.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' default_field: false - name: client.hash.md5 level: extended @@ -4735,10 +4720,8 @@ type: keyword ignore_above: 1024 description: Array of ciphers offered by the client during the client hello. - example: - - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 - - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 - - '...' + example: '["TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", + "..."]' default_field: false - name: client.x509.alternative_names level: extended @@ -4955,9 +4938,7 @@ description: Array of PEM-encoded certificates that make up the certificate chain offered by the server. This is usually mutually-exclusive of `server.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' default_field: false - name: server.hash.md5 level: extended diff --git a/generated/ecs/ecs_flat.yml b/generated/ecs/ecs_flat.yml index c27228d794..43a72942f3 100644 --- a/generated/ecs/ecs_flat.yml +++ b/generated/ecs/ecs_flat.yml @@ -1384,9 +1384,7 @@ dns.header_flags: description: 'Array of 2 letter DNS header flags. Expected values are: AA, TC, RD, RA, AD, CD, DO.' - example: - - RD - - RA + example: '["RD", "RA"]' flat_name: dns.header_flags ignore_above: 1024 level: extended @@ -1514,9 +1512,7 @@ dns.resolved_ip: it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for.' 
- example: - - 10.10.10.10 - - 10.10.10.11 + example: '["10.10.10.10", "10.10.10.11"]' flat_name: dns.resolved_ip level: extended name: resolved_ip @@ -4777,11 +4773,7 @@ process.args: executable. May be filtered to protect sensitive information.' - example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' flat_name: process.args ignore_above: 1024 level: extended @@ -5007,11 +4999,7 @@ process.parent.args: executable. May be filtered to protect sensitive information.' - example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' flat_name: process.parent.args ignore_above: 1024 level: extended @@ -5778,8 +5766,7 @@ rule.author: dashed_name: rule-author description: Name, organization, or pseudonym of the author or authors who created the rule used to generate this event. - example: - - Star-Lord + example: '["Star-Lord"]' flat_name: rule.author ignore_above: 1024 level: extended @@ -6998,9 +6985,7 @@ tls.client.certificate_chain: description: Array of PEM-encoded certificates that make up the certificate chain offered by the client. This is usually mutually-exclusive of `client.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' flat_name: tls.client.certificate_chain ignore_above: 1024 level: extended @@ -7126,10 +7111,8 @@ tls.client.subject: tls.client.supported_ciphers: dashed_name: tls-client-supported-ciphers description: Array of ciphers offered by the client during the client hello. - example: - - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 - - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 - - '...' 
+ example: '["TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", + "..."]' flat_name: tls.client.supported_ciphers ignore_above: 1024 level: extended @@ -7508,9 +7491,7 @@ tls.server.certificate_chain: description: Array of PEM-encoded certificates that make up the certificate chain offered by the server. This is usually mutually-exclusive of `server.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' flat_name: tls.server.certificate_chain ignore_above: 1024 level: extended diff --git a/generated/ecs/ecs_nested.yml b/generated/ecs/ecs_nested.yml index 490b00eb74..c56839e37b 100644 --- a/generated/ecs/ecs_nested.yml +++ b/generated/ecs/ecs_nested.yml @@ -1735,9 +1735,7 @@ dns: description: 'Array of 2 letter DNS header flags. Expected values are: AA, TC, RD, RA, AD, CD, DO.' - example: - - RD - - RA + example: '["RD", "RA"]' flat_name: dns.header_flags ignore_above: 1024 level: extended @@ -1866,9 +1864,7 @@ dns: formats it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for.' - example: - - 10.10.10.10 - - 10.10.10.11 + example: '["10.10.10.10", "10.10.10.11"]' flat_name: dns.resolved_ip level: extended name: resolved_ip @@ -5824,11 +5820,7 @@ process: the executable. May be filtered to protect sensitive information.' - example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' flat_name: process.args ignore_above: 1024 level: extended @@ -6054,11 +6046,7 @@ process: the executable. May be filtered to protect sensitive information.' 
- example: - - /usr/bin/ssh - - -l - - user - - 10.0.0.16 + example: '["/usr/bin/ssh", "-l", "user", "10.0.0.16"]' flat_name: process.parent.args ignore_above: 1024 level: extended @@ -6890,8 +6878,7 @@ rule: dashed_name: rule-author description: Name, organization, or pseudonym of the author or authors who created the rule used to generate this event. - example: - - Star-Lord + example: '["Star-Lord"]' flat_name: rule.author ignore_above: 1024 level: extended @@ -8193,9 +8180,7 @@ tls: description: Array of PEM-encoded certificates that make up the certificate chain offered by the client. This is usually mutually-exclusive of `client.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' flat_name: tls.client.certificate_chain ignore_above: 1024 level: extended @@ -8324,10 +8309,8 @@ tls: tls.client.supported_ciphers: dashed_name: tls-client-supported-ciphers description: Array of ciphers offered by the client during the client hello. - example: - - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 - - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 - - '...' + example: '["TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", + "..."]' flat_name: tls.client.supported_ciphers ignore_above: 1024 level: extended @@ -8706,9 +8689,7 @@ tls: description: Array of PEM-encoded certificates that make up the certificate chain offered by the server. This is usually mutually-exclusive of `server.certificate` since that value should be the first certificate in the chain. - example: - - MII... - - MII... + example: '["MII...", "MII..."]' flat_name: tls.server.certificate_chain ignore_above: 1024 level: extended diff --git a/schemas/README.md b/schemas/README.md index 9d1ac97696..c87be195a3 100644 --- a/schemas/README.md +++ b/schemas/README.md @@ -125,7 +125,9 @@ Supported keys to describe fields Defaults to the main description when absent. 
If the main description has multiple paragraphs, then a 'short' description with no newlines is required. -- example (optional): A single value example of what can be expected in this field +- example (optional): A single value example of what can be expected in this field. + Example values that are composite types (array, object) should be quoted to avoid YAML interpretation + in ECS-generated artifacts and other downstream projects depending on the schema. - multi\_fields (optional): Specify additional ways to index the field. - index (optional): If `False`, means field is not indexed (overrides type) - format: Field format that can be used in a Kibana index template. diff --git a/schemas/dns.yml b/schemas/dns.yml index 0c396a4a0f..afe11a190a 100644 --- a/schemas/dns.yml +++ b/schemas/dns.yml @@ -54,7 +54,7 @@ Array of 2 letter DNS header flags. Expected values are: AA, TC, RD, RA, AD, CD, DO. - example: [RD, RA] + example: "[\"RD\", \"RA\"]" normalize: - array @@ -205,6 +205,6 @@ data formats it can contain. Extracting all IP addresses seen in there to `dns.resolved_ip` makes it possible to index them as IP addresses, and makes them easier to visualize and query for. - example: [10.10.10.10, 10.10.10.11] + example: '["10.10.10.10", "10.10.10.11"]' normalize: - array diff --git a/schemas/process.yml b/schemas/process.yml index b8f1f4b11e..13ec63c07f 100644 --- a/schemas/process.yml +++ b/schemas/process.yml @@ -92,7 +92,7 @@ Array of process arguments, starting with the absolute path to the executable. May be filtered to protect sensitive information. - example: ["/usr/bin/ssh", "-l", "user", "10.0.0.16"] + example: "[\"/usr/bin/ssh\", \"-l\", \"user\", \"10.0.0.16\"]" normalize: - array diff --git a/schemas/rule.yml b/schemas/rule.yml index a9f6966705..c0daf79892 100644 --- a/schemas/rule.yml +++ b/schemas/rule.yml @@ -88,7 +88,7 @@ description: > Name, organization, or pseudonym of the author or authors who created the rule used to generate this event. 
- example: ['Star-Lord'] + example: "[\"Star-Lord\"]" normalize: - array diff --git a/schemas/tls.yml b/schemas/tls.yml index 569f09d54a..3ecacb041a 100644 --- a/schemas/tls.yml +++ b/schemas/tls.yml @@ -73,7 +73,7 @@ type: keyword level: extended description: Array of ciphers offered by the client during the client hello. - example: ["TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384", "..."] + example: "[\"TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384\", \"TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384\", \"...\"]" normalize: - array @@ -109,7 +109,7 @@ Array of PEM-encoded certificates that make up the certificate chain offered by the client. This is usually mutually-exclusive of `client.certificate` since that value should be the first certificate in the chain. - example: ["MII...", "MII..."] + example: "[\"MII...\", \"MII...\"]" normalize: - array @@ -188,7 +188,7 @@ Array of PEM-encoded certificates that make up the certificate chain offered by the server. This is usually mutually-exclusive of `server.certificate` since that value should be the first certificate in the chain. 
- example: ["MII...", "MII..."] + example: "[\"MII...\", \"MII...\"]" normalize: - array diff --git a/scripts/schema/cleaner.py b/scripts/schema/cleaner.py index ec48598bea..8983452689 100644 --- a/scripts/schema/cleaner.py +++ b/scripts/schema/cleaner.py @@ -165,7 +165,9 @@ def field_mandatory_attributes(field): def field_assertions_and_warnings(field): '''Additional checks on a fleshed out field''' if not ecs_helpers.is_intermediate(field): - single_line_short_description(field) + # check short description length if in strict mode + single_line_short_description(field, strict=strict_mode) + check_example_value(field, strict=strict_mode) if field['field_details']['level'] not in ACCEPTABLE_FIELD_LEVELS: msg = "Invalid level for field '{}'.\nValue: {}\nAcceptable values: {}".format( field['field_details']['name'], field['field_details']['level'], @@ -186,4 +188,22 @@ def single_line_short_description(schema_or_field): msg += "Offending field or field set: {}\nShort description:\n {}".format( schema_or_field['field_details']['name'], schema_or_field['field_details']['short']) - raise ValueError(msg) + if strict: + raise ValueError(msg) + else: + ecs_helpers.strict_warning(msg) + + +def check_example_value(field, strict=True): + """ + Checks if value of the example field is of type list or dict. + Fails or warns (depending on strict mode) if so. + """ + example_value = field['field_details'].get('example', None) + if isinstance(example_value, (list, dict)): + name = field['field_details']['name'] + msg = f"Example value for field `{name}` contains an object or array which must be quoted to avoid YAML interpretation." 
+ if strict: + raise ValueError(msg) + else: + ecs_helpers.strict_warning(msg) diff --git a/scripts/tests/unit/test_schema_cleaner.py b/scripts/tests/unit/test_schema_cleaner.py index ed82218706..8298a32bb3 100644 --- a/scripts/tests/unit/test_schema_cleaner.py +++ b/scripts/tests/unit/test_schema_cleaner.py @@ -262,6 +262,84 @@ def test_multiline_short_description_raises(self): with self.assertRaisesRegex(ValueError, 'single line'): cleaner.single_line_short_description(schema) + def test_very_long_short_description_warns_strict_disabled(self): + schema = {'field_details': { + 'name': 'fake_schema', + 'short': "Single line but really long. " * 10}} + try: + with self.assertWarnsRegex(UserWarning, 'under 120 characters \(current length: 290\)'): + cleaner.single_line_short_description(schema, strict=False) + except Exception: + self.fail("cleaner.single_line_short_description() raised Exception unexpectedly.") + + def test_multiline_short_description_warns_strict_disabled(self): + schema = {'field_details': { + 'name': 'fake_schema', + 'short': "multiple\nlines"}} + try: + with self.assertWarnsRegex(UserWarning, 'single line'): + cleaner.single_line_short_description(schema, strict=False) + except Exception: + self.fail("cleaner.single_line_short_description() raised Exception unexpectedly.") + + def test_field_example_value_is_object_raises(self): + field = { + 'field_details': { + 'name': 'test', + 'example': { + 'a': 'bob', + 'b': 'alice' + } + } + } + with self.assertRaisesRegex(ValueError, 'contains an object or array'): + cleaner.check_example_value(field) + + def test_field_example_value_is_array_raises(self): + field = { + 'field_details': { + 'name': 'test', + 'example': [ + 'bob', + 'alice' + ] + } + } + with self.assertRaisesRegex(ValueError, 'contains an object or array'): + cleaner.check_example_value(field) + + def test_example_field_value_is_object_warns_strict_disabled(self): + field = { + 'field_details': { + 'name': 'test', + 'example': { + 'a': 
'bob', + 'b': 'alice' + } + } + } + try: + with self.assertWarnsRegex(UserWarning, 'contains an object or array'): + cleaner.check_example_value(field, strict=False) + except Exception: + self.fail("cleaner.check_example_value() raised Exception unexpectedly.") + + def test_example_field_value_is_array_warns_strict_disabled(self): + field = { + 'field_details': { + 'name': 'test', + 'example': [ + 'bob', + 'alice' + ] + } + } + try: + with self.assertWarnsRegex(UserWarning, 'contains an object or array'): + cleaner.check_example_value(field, strict=False) + except Exception: + self.fail("cleaner.check_example_value() raised Exception unexpectedly.") + def test_clean(self): '''A high level sanity test''' fields = self.schema_process()