From 4ab85fa766a76ba37a685b02bd14eb79dde73744 Mon Sep 17 00:00:00 2001
From: Eric Beahan <eric.beahan@elastic.co>
Date: Wed, 13 Jan 2021 16:33:14 -0600
Subject: [PATCH] [1.x] Stage 2 changes for RFC 0009 - data_stream fields
 (#1215) (#1222)

---
 experimental/generated/beats/fields.ecs.yml   | 52 ++++++++++++++
 experimental/generated/csv/fields.csv         |  3 +
 experimental/generated/ecs/ecs_flat.yml       | 46 +++++++++++++
 experimental/generated/ecs/ecs_nested.yml     | 69 +++++++++++++++++++
 .../generated/elasticsearch/7/template.json   | 13 ++++
 .../elasticsearch/component/data_stream.json  | 25 +++++++
 .../generated/elasticsearch/template.json     |  3 +-
 experimental/schemas/data_stream.yml          | 60 ++++++++++++++++
 8 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 experimental/generated/elasticsearch/component/data_stream.json
 create mode 100644 experimental/schemas/data_stream.yml

diff --git a/experimental/generated/beats/fields.ecs.yml b/experimental/generated/beats/fields.ecs.yml
index 02da5c2ee4..d19d6a36d8 100644
--- a/experimental/generated/beats/fields.ecs.yml
+++ b/experimental/generated/beats/fields.ecs.yml
@@ -564,6 +564,58 @@
       ignore_above: 1024
       description: Runtime managing this container.
       example: docker
+  - name: data_stream
+    title: Data Stream
+    group: 2
+    description: 'The data_stream fields take part in defining the new data stream
+      naming scheme.
+
+      In the new data stream naming scheme the value of the data stream fields combine
+      to the name of the actual data stream in the following manner `{data_stream.type}-{data_stream.dataset}-{data_stream.namespace}`.
+      This means the fields can only contain characters that are valid as part of
+      names of data streams. More details about this can be found in this https://www.elastic.co/blog/an-introduction-to-the-elastic-data-stream-naming-scheme[blog
+      post].
+
+      An Elasticsearch data stream consists of one or more backing indices, and a
+      data stream name forms part of the backing indices names. Due to this convention,
+      data streams must also follow index naming restrictions. For example, data stream
+      names cannot include \, /, *, ?, ", <, >, |, ` `. Please see the Elasticsearch
+      reference for additional https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html#indices-create-api-path-params[restrictions].'
+    type: group
+    fields:
+    - name: dataset
+      level: extended
+      type: constant_keyword
+      description: "The field can contain anything that makes sense to signify the\
+        \ source of the data.\nExamples include `nginx.access`, `prometheus`, `endpoint`\
+        \ etc. For data streams that otherwise fit, but that do not have dataset set\
+        \ we use the value \"generic\" for the dataset value. `event.dataset` should\
+        \ have the same value as `data_stream.dataset`.\nBeyond the Elasticsearch\
+        \ data stream naming criteria noted above, the `dataset` value has additional\
+        \ restrictions:\n  * Must not contain `-`\n  * No longer than 100 characters"
+      example: nginx.access
+      default_field: false
+    - name: namespace
+      level: extended
+      type: constant_keyword
+      description: "A user defined namespace. Namespaces are useful to allow grouping\
+        \ of data.\nMany users already organize their indices this way, and the data\
+        \ stream naming scheme now provides this best practice as a default. Many\
+        \ users will populate this field with `default`. If no value is used, it falls\
+        \ back to `default`.\nBeyond the Elasticsearch index naming criteria noted\
+        \ above, `namespace` value has the additional restrictions:\n  * Must not\
+        \ contain `-`\n  * No longer than 100 characters"
+      example: production
+      default_field: false
+    - name: type
+      level: extended
+      type: constant_keyword
+      description: 'An overarching type for the data stream.
+
+        Currently allowed values are "logs" and "metrics". We expect to also add "traces"
+        and "synthetics" in the near future.'
+      example: logs
+      default_field: false
   - name: destination
     title: Destination
     group: 2
diff --git a/experimental/generated/csv/fields.csv b/experimental/generated/csv/fields.csv
index b5efd516c7..95199f66a2 100644
--- a/experimental/generated/csv/fields.csv
+++ b/experimental/generated/csv/fields.csv
@@ -60,6 +60,9 @@ ECS_Version,Indexed,Field_Set,Field,Type,Level,Normalization,Example,Description
 1.9.0-dev+exp,true,container,container.labels,object,extended,,,Image labels.
 1.9.0-dev+exp,true,container,container.name,keyword,extended,,,Container name.
 1.9.0-dev+exp,true,container,container.runtime,keyword,extended,,docker,Runtime managing this container.
+1.9.0-dev+exp,true,data_stream,data_stream.dataset,constant_keyword,extended,,nginx.access,The field can contain anything that makes sense to signify the source of the data.
+1.9.0-dev+exp,true,data_stream,data_stream.namespace,constant_keyword,extended,,production,A user defined namespace. Namespaces are useful to allow grouping of data.
+1.9.0-dev+exp,true,data_stream,data_stream.type,constant_keyword,extended,,logs,An overarching type for the data stream.
 1.9.0-dev+exp,true,destination,destination.address,keyword,extended,,,Destination network address.
 1.9.0-dev+exp,true,destination,destination.as.number,long,extended,,15169,Unique number allocated to the autonomous system.
 1.9.0-dev+exp,true,destination,destination.as.organization.name,wildcard,extended,,Google LLC,Organization name.
diff --git a/experimental/generated/ecs/ecs_flat.yml b/experimental/generated/ecs/ecs_flat.yml
index f98d8b95ce..a7c053c2d1 100644
--- a/experimental/generated/ecs/ecs_flat.yml
+++ b/experimental/generated/ecs/ecs_flat.yml
@@ -705,6 +705,52 @@ container.runtime:
   normalize: []
   short: Runtime managing this container.
   type: keyword
+data_stream.dataset:
+  dashed_name: data-stream-dataset
+  description: "The field can contain anything that makes sense to signify the source\
+    \ of the data.\nExamples include `nginx.access`, `prometheus`, `endpoint` etc.\
+    \ For data streams that otherwise fit, but that do not have dataset set we use\
+    \ the value \"generic\" for the dataset value. `event.dataset` should have the\
+    \ same value as `data_stream.dataset`.\nBeyond the Elasticsearch data stream naming\
+    \ criteria noted above, the `dataset` value has additional restrictions:\n  *\
+    \ Must not contain `-`\n  * No longer than 100 characters"
+  example: nginx.access
+  flat_name: data_stream.dataset
+  level: extended
+  name: dataset
+  normalize: []
+  short: The field can contain anything that makes sense to signify the source of
+    the data.
+  type: constant_keyword
+data_stream.namespace:
+  dashed_name: data-stream-namespace
+  description: "A user defined namespace. Namespaces are useful to allow grouping\
+    \ of data.\nMany users already organize their indices this way, and the data stream\
+    \ naming scheme now provides this best practice as a default. Many users will\
+    \ populate this field with `default`. If no value is used, it falls back to `default`.\n\
+    Beyond the Elasticsearch index naming criteria noted above, `namespace` value\
+    \ has the additional restrictions:\n  * Must not contain `-`\n  * No longer than\
+    \ 100 characters"
+  example: production
+  flat_name: data_stream.namespace
+  level: extended
+  name: namespace
+  normalize: []
+  short: A user defined namespace. Namespaces are useful to allow grouping of data.
+  type: constant_keyword
+data_stream.type:
+  dashed_name: data-stream-type
+  description: 'An overarching type for the data stream.
+
+    Currently allowed values are "logs" and "metrics". We expect to also add "traces"
+    and "synthetics" in the near future.'
+  example: logs
+  flat_name: data_stream.type
+  level: extended
+  name: type
+  normalize: []
+  short: An overarching type for the data stream.
+  type: constant_keyword
 destination.address:
   dashed_name: destination-address
   description: 'Some event destination addresses are defined ambiguously. The event
diff --git a/experimental/generated/ecs/ecs_nested.yml b/experimental/generated/ecs/ecs_nested.yml
index 97acbc2459..2b825db77d 100644
--- a/experimental/generated/ecs/ecs_nested.yml
+++ b/experimental/generated/ecs/ecs_nested.yml
@@ -983,6 +983,75 @@ container:
   short: Fields describing the container that generated this event.
   title: Container
   type: group
+data_stream:
+  description: 'The data_stream fields take part in defining the new data stream naming
+    scheme.
+
+    In the new data stream naming scheme the value of the data stream fields combine
+    to the name of the actual data stream in the following manner `{data_stream.type}-{data_stream.dataset}-{data_stream.namespace}`.
+    This means the fields can only contain characters that are valid as part of names
+    of data streams. More details about this can be found in this https://www.elastic.co/blog/an-introduction-to-the-elastic-data-stream-naming-scheme[blog
+    post].
+
+    An Elasticsearch data stream consists of one or more backing indices, and a data
+    stream name forms part of the backing indices names. Due to this convention, data
+    streams must also follow index naming restrictions. For example, data stream names
+    cannot include \, /, *, ?, ", <, >, |, ` `. Please see the Elasticsearch reference
+    for additional https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html#indices-create-api-path-params[restrictions].'
+  fields:
+    data_stream.dataset:
+      dashed_name: data-stream-dataset
+      description: "The field can contain anything that makes sense to signify the\
+        \ source of the data.\nExamples include `nginx.access`, `prometheus`, `endpoint`\
+        \ etc. For data streams that otherwise fit, but that do not have dataset set\
+        \ we use the value \"generic\" for the dataset value. `event.dataset` should\
+        \ have the same value as `data_stream.dataset`.\nBeyond the Elasticsearch\
+        \ data stream naming criteria noted above, the `dataset` value has additional\
+        \ restrictions:\n  * Must not contain `-`\n  * No longer than 100 characters"
+      example: nginx.access
+      flat_name: data_stream.dataset
+      level: extended
+      name: dataset
+      normalize: []
+      short: The field can contain anything that makes sense to signify the source
+        of the data.
+      type: constant_keyword
+    data_stream.namespace:
+      dashed_name: data-stream-namespace
+      description: "A user defined namespace. Namespaces are useful to allow grouping\
+        \ of data.\nMany users already organize their indices this way, and the data\
+        \ stream naming scheme now provides this best practice as a default. Many\
+        \ users will populate this field with `default`. If no value is used, it falls\
+        \ back to `default`.\nBeyond the Elasticsearch index naming criteria noted\
+        \ above, `namespace` value has the additional restrictions:\n  * Must not\
+        \ contain `-`\n  * No longer than 100 characters"
+      example: production
+      flat_name: data_stream.namespace
+      level: extended
+      name: namespace
+      normalize: []
+      short: A user defined namespace. Namespaces are useful to allow grouping of
+        data.
+      type: constant_keyword
+    data_stream.type:
+      dashed_name: data-stream-type
+      description: 'An overarching type for the data stream.
+
+        Currently allowed values are "logs" and "metrics". We expect to also add "traces"
+        and "synthetics" in the near future.'
+      example: logs
+      flat_name: data_stream.type
+      level: extended
+      name: type
+      normalize: []
+      short: An overarching type for the data stream.
+      type: constant_keyword
+  group: 2
+  name: data_stream
+  prefix: data_stream.
+  short: The data_stream fields take part in defining the new data stream naming scheme.
+  title: Data Stream
+  type: group
 destination:
   description: 'Destination fields capture details about the receiver of a network
     exchange/packet. These fields are populated from a network event, packet, or other
diff --git a/experimental/generated/elasticsearch/7/template.json b/experimental/generated/elasticsearch/7/template.json
index 029aa451f3..7420e1c441 100644
--- a/experimental/generated/elasticsearch/7/template.json
+++ b/experimental/generated/elasticsearch/7/template.json
@@ -303,6 +303,19 @@
           }
         }
       },
+      "data_stream": {
+        "properties": {
+          "dataset": {
+            "type": "constant_keyword"
+          },
+          "namespace": {
+            "type": "constant_keyword"
+          },
+          "type": {
+            "type": "constant_keyword"
+          }
+        }
+      },
       "destination": {
         "properties": {
           "address": {
diff --git a/experimental/generated/elasticsearch/component/data_stream.json b/experimental/generated/elasticsearch/component/data_stream.json
new file mode 100644
index 0000000000..3d4d93c586
--- /dev/null
+++ b/experimental/generated/elasticsearch/component/data_stream.json
@@ -0,0 +1,25 @@
+{
+  "_meta": {
+    "documentation": "https://www.elastic.co/guide/en/ecs/current/ecs-data_stream.html",
+    "ecs_version": "1.9.0-dev+exp"
+  },
+  "template": {
+    "mappings": {
+      "properties": {
+        "data_stream": {
+          "properties": {
+            "dataset": {
+              "type": "constant_keyword"
+            },
+            "namespace": {
+              "type": "constant_keyword"
+            },
+            "type": {
+              "type": "constant_keyword"
+            }
+          }
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/experimental/generated/elasticsearch/template.json b/experimental/generated/elasticsearch/template.json
index b8f252c020..f81f6b49dc 100644
--- a/experimental/generated/elasticsearch/template.json
+++ b/experimental/generated/elasticsearch/template.json
@@ -37,7 +37,8 @@
     "ecs_1.9.0-dev-exp_url",
     "ecs_1.9.0-dev-exp_user",
     "ecs_1.9.0-dev-exp_user_agent",
-    "ecs_1.9.0-dev-exp_vulnerability"
+    "ecs_1.9.0-dev-exp_vulnerability",
+    "ecs_1.9.0-dev-exp_data_stream"
   ],
   "index_patterns": [
     "try-ecs-*"
diff --git a/experimental/schemas/data_stream.yml b/experimental/schemas/data_stream.yml
new file mode 100644
index 0000000000..d651800fa4
--- /dev/null
+++ b/experimental/schemas/data_stream.yml
@@ -0,0 +1,60 @@
+---
+- name: data_stream
+  title: Data Stream
+  short: The data_stream fields take part in defining the new data stream naming scheme.
+  description: >
+    The data_stream fields take part in defining the new data stream naming scheme.
+
+    In the new data stream naming scheme the value of the data stream fields combine to the name of the actual data
+    stream in the following manner `{data_stream.type}-{data_stream.dataset}-{data_stream.namespace}`. This means the fields
+    can only contain characters that are valid as part of names of data streams. More details about this can be found in
+    this https://www.elastic.co/blog/an-introduction-to-the-elastic-data-stream-naming-scheme[blog post].
+
+    An Elasticsearch data stream consists of one or more backing indices, and a data stream name forms part of the backing indices names.
+    Due to this convention, data streams must also follow index naming restrictions. For example, data stream names cannot include \, /, *, ?, ", <, >, |, ` `.
+    Please see the Elasticsearch reference for additional https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html#indices-create-api-path-params[restrictions].
+  fields:
+
+    - name: type
+      level: extended
+      type: constant_keyword
+      example: logs
+      # Any value added to `data_stream.type` in the future should also adhere to the following restrictions (derived from the Elasticsearch index naming restrictions):
+      # * Must not contain `-`
+      # * Must not start with `+` or `_`
+      description: >
+        An overarching type for the data stream.
+
+        Currently allowed values are "logs" and "metrics". We expect to also add "traces" and "synthetics" in the near future.
+      short: An overarching type for the data stream.
+
+    - name: dataset
+      level: extended
+      type: constant_keyword
+      example: nginx.access
+      description: >
+        The field can contain anything that makes sense to signify the source of the data.
+
+        Examples include `nginx.access`, `prometheus`, `endpoint` etc. For data streams that otherwise fit, but that
+        do not have dataset set we use the value "generic" for the dataset value. `event.dataset` should have the
+        same value as `data_stream.dataset`.
+
+        Beyond the Elasticsearch data stream naming criteria noted above, the `dataset` value has additional restrictions:
+          * Must not contain `-`
+          * No longer than 100 characters
+      short: The field can contain anything that makes sense to signify the source of the data.
+
+    - name: namespace
+      level: extended
+      type: constant_keyword
+      example: production
+      description: >
+        A user defined namespace. Namespaces are useful to allow grouping of data.
+
+        Many users already organize their indices this way, and the data stream naming scheme now provides this
+        best practice as a default. Many users will populate this field with `default`. If no value is used, it falls back to `default`.
+
+        Beyond the Elasticsearch index naming criteria noted above, `namespace` value has the additional restrictions:
+          * Must not contain `-`
+          * No longer than 100 characters
+      short: A user defined namespace. Namespaces are useful to allow grouping of data.