median absolute deviation agg (elastic#34482)

This commit adds a new single value metric aggregation that calculates the statistic called median absolute deviation, which is a measure of variability that works on more types of data than standard deviation. Our calculation of MAD is approximated using t-digests. In the collect phase, we collect each value visited into a t-digest. In the reduce phase, we merge all value t-digests, then create a t-digest of deviations using the first t-digest's median and centroids.
pgomulka · Oct 30, 2018 · b8280ea · b8280ea
1 parent 7bd113d
commit b8280ea
Show file tree

Hide file tree

Showing 19 changed files with 2,016 additions and 0 deletions.
diff --git a/client/rest-high-level/src/main/java/org/elasticsearch/client/RestHighLevelClient.java b/client/rest-high-level/src/main/java/org/elasticsearch/client/RestHighLevelClient.java
@@ -156,6 +156,8 @@
 import org.elasticsearch.search.aggregations.metrics.SumAggregationBuilder;
 import org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder;
 import org.elasticsearch.search.aggregations.metrics.ValueCountAggregationBuilder;
+import org.elasticsearch.search.aggregations.metrics.MedianAbsoluteDeviationAggregationBuilder;
+import org.elasticsearch.search.aggregations.metrics.ParsedMedianAbsoluteDeviation;
 import org.elasticsearch.search.aggregations.pipeline.InternalSimpleValue;
 import org.elasticsearch.search.aggregations.pipeline.ParsedSimpleValue;
 import org.elasticsearch.search.aggregations.pipeline.InternalBucketMetricValue;
@@ -1537,6 +1539,7 @@ static List<NamedXContentRegistry.Entry> getDefaultNamedXContents() {
         map.put(InternalTDigestPercentiles.NAME, (p, c) -> ParsedTDigestPercentiles.fromXContent(p, (String) c));
         map.put(InternalTDigestPercentileRanks.NAME, (p, c) -> ParsedTDigestPercentileRanks.fromXContent(p, (String) c));
         map.put(PercentilesBucketPipelineAggregationBuilder.NAME, (p, c) -> ParsedPercentilesBucket.fromXContent(p, (String) c));
+        map.put(MedianAbsoluteDeviationAggregationBuilder.NAME, (p, c) -> ParsedMedianAbsoluteDeviation.fromXContent(p, (String) c));
         map.put(MinAggregationBuilder.NAME, (p, c) -> ParsedMin.fromXContent(p, (String) c));
         map.put(MaxAggregationBuilder.NAME, (p, c) -> ParsedMax.fromXContent(p, (String) c));
         map.put(SumAggregationBuilder.NAME, (p, c) -> ParsedSum.fromXContent(p, (String) c));

diff --git a/docs/build.gradle b/docs/build.gradle
@@ -1082,6 +1082,34 @@ buildRestTests.setups['calendar_outages_addevent'] = buildRestTests.setups['cale
            ]}
 '''
 
+// used by median absolute deviation aggregation
+buildRestTests.setups['reviews'] = '''
+  - do:
+        indices.create:
+          index: reviews
+          body:
+            settings:
+              number_of_shards: 1
+              number_of_replicas: 0
+            mappings:
+              _doc:
+                properties:
+                  product:
+                    type: keyword
+                  rating:
+                    type: long
+  - do:
+        bulk:
+          index: reviews
+          type: _doc
+          refresh: true
+          body: |
+            {"index": {"_id": "1"}}
+            {"product": "widget-foo", "rating": 1}
+            {"index": {"_id": "2"}}
+            {"product": "widget-foo", "rating": 5}
+'''
+
 buildRestTests.setups['remote_cluster'] = buildRestTests.setups['host'] + '''
   - do:
       cluster.put_settings:

diff --git a/docs/reference/aggregations/metrics.asciidoc b/docs/reference/aggregations/metrics.asciidoc
@@ -41,6 +41,8 @@ include::metrics/tophits-aggregation.asciidoc[]
 
 include::metrics/valuecount-aggregation.asciidoc[]
 
+include::metrics/median-absolute-deviation-aggregation.asciidoc[]
+
 
 
 

diff --git a/docs/reference/aggregations/metrics/median-absolute-deviation-aggregation.asciidoc b/docs/reference/aggregations/metrics/median-absolute-deviation-aggregation.asciidoc
@@ -0,0 +1,189 @@
+[[search-aggregations-metrics-median-absolute-deviation-aggregation]]
+=== Median Absolute Deviation Aggregation
+
+This `single-value` aggregation approximates the https://en.wikipedia.org/wiki/Median_absolute_deviation[median absolute deviation]
+of its search results.
+
+Median absolute deviation is a measure of variability. It is a robust
+statistic, meaning that it is useful for describing data that may have
+outliers, or may not be normally distributed. For such data it can be more
+descriptive than standard deviation.
+
+It is calculated as the median of each data point's deviation from the median
+of the entire sample. That is, for a random variable X, the median absolute
+deviation is median(|median(X) - X~i~|).
+
+==== Example
+
+Assume our data represents product reviews on a one to five star scale.
+Such reviews are usually summarized as a mean, which is easily understandable
+but doesn't describe the reviews' variability. Estimating the median absolute
+deviation can provide insight into how much reviews vary from one another.
+
+In this example we have a product which has an average rating of
+3 stars. Let's look at its ratings' median absolute deviation to determine
+how much they vary:
+
+[source,js]
+---------------------------------------------------------
+GET reviews/_search
+{
+  "size": 0,
+  "aggs": {
+    "review_average": {
+      "avg": {
+        "field": "rating"
+      }
+    },
+    "review_variability": {
+      "median_absolute_deviation": {
+        "field": "rating" <1>
+      }
+    }
+  }
+}
+---------------------------------------------------------
+// CONSOLE
+// TEST[setup:reviews]
+<1> `rating` must be a numeric field
+
+The resulting median absolute deviation of `2` tells us that there is a fair
+amount of variability in the ratings. Reviewers must have diverse opinions about
+this product.
+
+[source,js]
+---------------------------------------------------------
+{
+  ...
+  "aggregations": {
+    "review_average": {
+      "value": 3.0
+    },
+    "review_variability": {
+      "value": 2.0
+    }
+  }
+}
+---------------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+
+==== Approximation
+
+The naive implementation of calculating median absolute deviation stores the
+entire sample in memory, so this aggregation instead calculates an
+approximation. It uses the https://github.com/tdunning/t-digest[TDigest data structure]
+to approximate the sample median and the median of deviations from the sample
+median. For more about the approximation characteristics of TDigests, see
+<<search-aggregations-metrics-percentile-aggregation-approximation>>.
+
+The tradeoff between resource usage and accuracy of a TDigest's quantile
+approximation, and therefore the accuracy of this aggregation's approximation
+of median absolute deviation, is controlled by the `compression` parameter. A
+higher `compression` setting provides a more accurate approximation at the
+cost of higher memory usage. For more about the characteristics of the TDigest
+`compression` parameter see
+<<search-aggregations-metrics-percentile-aggregation-compression>>.
+
+[source,js]
+---------------------------------------------------------
+GET reviews/_search
+{
+  "size": 0,
+  "aggs": {
+    "review_variability": {
+      "median_absolute_deviation": {
+        "field": "rating",
+        "compression": 100
+      }
+    }
+  }
+}
+---------------------------------------------------------
+// CONSOLE
+// TEST[setup:reviews]
+
+The default `compression` value for this aggregation is `1000`. At this
+compression level this aggregation is usually within 5% of the exact result,
+but observed performance will depend on the sample data.
+
+==== Script
+
+This metric aggregation supports scripting. In our example above, product
+reviews are on a scale of one to five. If we wanted to modify them to a scale
+of one to ten, we can use scripting.
+
+To provide an inline script:
+
+[source,js]
+---------------------------------------------------------
+GET reviews/_search
+{
+  "size": 0,
+  "aggs": {
+    "review_variability": {
+      "median_absolute_deviation": {
+        "script": {
+          "lang": "painless",
+          "source": "doc['rating'].value * params.scaleFactor",
+          "params": {
+            "scaleFactor": 2
+          }
+        }
+      }
+    }
+  }
+}
+---------------------------------------------------------
+// CONSOLE
+// TEST[setup:reviews]
+
+To provide a stored script:
+
+[source,js]
+---------------------------------------------------------
+GET reviews/_search
+{
+  "size": 0,
+  "aggs": {
+    "review_variability": {
+      "median_absolute_deviation": {
+        "script": {
+          "id": "my_script",
+          "params": {
+            "field": "rating"
+          }
+        }
+      }
+    }
+  }
+}
+---------------------------------------------------------
+// CONSOLE
+// TEST[setup:reviews,stored_example_script]
+
+==== Missing value
+
+The `missing` parameter defines how documents that are missing a value should be
+treated. By default they will be ignored but it is also possible to treat them
+as if they had a value.
+
+Let's be optimistic and assume some reviewers loved the product so much that
+they forgot to give it a rating. We'll assign them five stars:
+
+[source,js]
+---------------------------------------------------------
+GET reviews/_search
+{
+  "size": 0,
+  "aggs": {
+    "review_variability": {
+      "median_absolute_deviation": {
+        "field": "rating",
+        "missing": 5
+      }
+    }
+  }
+}
+---------------------------------------------------------
+// CONSOLE
+// TEST[setup:reviews]
diff --git a/.../resources/rest-api-spec/test/search.aggregation/270_median_absolute_deviation_metric.yml b/.../resources/rest-api-spec/test/search.aggregation/270_median_absolute_deviation_metric.yml
@@ -0,0 +1,143 @@
+setup:
+  - skip:
+        version: " - 6.6.0"
+        reason:  "added in 6.6.0"
+  - do:
+        indices.create:
+            index: test
+            body:
+              settings:
+                number_of_replicas: 0
+              mappings:
+                _doc:
+                  properties:
+                    int_field:
+                      type: integer
+                    double_field:
+                      type: double
+                    incomplete_field:
+                      type: integer
+  - do:
+        bulk:
+          refresh: true
+          body:
+            - index:
+                _index: test
+                _type: _doc
+            - int_field: 100
+              double_field: 100.0
+              incomplete_field: 1000
+            - index:
+                _index: test
+                _type: _doc
+            - int_field: 200
+              double_field: 200.0
+              incomplete_field: 2000
+            - index:
+                _index: test
+                _type: _doc
+            - int_field: 300
+              double_field: 300.0
+
+---
+"basic test":
+
+  - do:
+      search:
+        body:
+          aggs:
+            mad_int:
+              median_absolute_deviation:
+                field: int_field
+            mad_double:
+              median_absolute_deviation:
+                field: double_field
+
+  - match: { hits.total: 3 }
+  - length: { hits.hits: 3 }
+
+  - match: { aggregations.mad_int.value: 100 }
+  - match: { aggregations.mad_double.value: 100 }
+
+---
+"with setting compression":
+
+  - do:
+      search:
+        body:
+          aggs:
+            mad_int:
+              median_absolute_deviation:
+                field: int_field
+                compression: 500
+            mad_double:
+              median_absolute_deviation:
+                field: double_field
+                compression: 500
+
+  - match: { hits.total: 3 }
+  - length: { hits.hits: 3 }
+
+  - match: { aggregations.mad_int.value: 100 }
+  - match: { aggregations.mad_double.value: 100 }
+
+---
+"no documents":
+
+  - do:
+      search:
+        body:
+          query:
+            bool:
+              filter:
+                term:
+                  non_existent_field: non_existent_value
+          aggs:
+            mad_no_docs:
+              median_absolute_deviation:
+                field: non_existent_field
+
+  - match: { hits.total: 0 }
+  - length: { hits.hits: 0 }
+
+  - match: { aggregations.mad_no_docs.value: null }
+
+---
+"missing value":
+
+  - do:
+      search:
+        body:
+          aggs:
+            mad_missing:
+              median_absolute_deviation:
+                field: incomplete_field
+                missing: 3000
+
+  - match: { hits.total: 3 }
+  - length: { hits.hits: 3 }
+
+  - match: { aggregations.mad_missing.value: 1000 }
+
+---
+"bad arguments":
+
+  - do:
+      catch: /\[compression\] must be greater than 0. Found \[0.0\] in \[mad\]/
+      search:
+        body:
+          aggs:
+            mad:
+              median_absolute_deviation:
+                field: int_field
+                compression: 0
+
+  - do:
+      catch: /\[compression\] must be greater than 0. Found \[-1.0\] in \[mad\]/
+      search:
+        body:
+          aggs:
+            mad:
+              median_absolute_deviation:
+                field: int_field
+                compression: -1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -41,6 +41,8 @@ include::metrics/tophits-aggregation.asciidoc[]

		include::metrics/valuecount-aggregation.asciidoc[]

		include::metrics/median-absolute-deviation-aggregation.asciidoc[]




Expand Down