Skip to content

Commit

Permalink
refactor: combine sampling and outlier operators
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Oct 12, 2024
1 parent cf6d06f commit 38a073f
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 170 deletions.
82 changes: 0 additions & 82 deletions docetl/operations/outliers.py

This file was deleted.

121 changes: 102 additions & 19 deletions docetl/operations/sample.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from docetl.operations.base import BaseOperation
from docetl.operations.clustering_utils import get_embeddings_for_clustering


class SampleOperation(BaseOperation):
Expand All @@ -18,7 +20,52 @@ def syntax_check(self) -> None:
ValueError: If required keys are missing or invalid in the configuration.
TypeError: If configuration values have incorrect types.
"""
pass
if "samples" not in self.config and "outliers" not in self.config:
raise ValueError(
"Must specify either 'samples' or 'outliers' in SampleOperation configuration"
)

if "samples" in self.config:
if not isinstance(self.config["samples"], (int, float, list)) or (
isinstance(self.config["samples"], (int, float))
and self.config["samples"] <= 0
):
raise TypeError("'samples' must be a positive integer, float, or list")

if "outliers" in self.config:
outliers_config = self.config["outliers"]
if "std" not in outliers_config and "samples" not in outliers_config:
raise ValueError(
"Must specify either 'std' or 'samples' in outliers configuration"
)

if "std" in outliers_config:
if (
not isinstance(outliers_config["std"], (int, float))
or outliers_config["std"] <= 0
):
raise TypeError("'std' in outliers must be a positive number")

if "samples" in outliers_config:
if (
not isinstance(outliers_config["samples"], (int, float))
or outliers_config["samples"] <= 0
):
raise TypeError(
"'samples' in outliers must be a positive integer or float"
)

if "embedding_keys" not in outliers_config:
raise ValueError(
"'embedding_keys' must be specified in outliers configuration"
)

if not isinstance(outliers_config["embedding_keys"], list) or not all(
isinstance(key, str) for key in outliers_config["embedding_keys"]
):
raise TypeError(
"'embedding_keys' in outliers must be a list of strings"
)

def execute(
self, input_data: List[Dict], is_build: bool = False
Expand All @@ -35,26 +82,62 @@ def execute(
Tuple[List[Dict], float]: A tuple containing the filtered
list of dictionaries and the total cost of the operation.
"""
cost = 0
if not input_data:
return [], cost

samples = self.config["samples"]
if isinstance(samples, list):
keys = list(samples[0].keys())
key_to_doc = {tuple([doc[key] for key in keys]): doc for doc in input_data}
if "outliers" in self.config:
# Outlier functionality
outliers_config = self.config["outliers"]
embeddings, embedding_cost = get_embeddings_for_clustering(
input_data, outliers_config, self.runner.api
)
cost += embedding_cost
embeddings = np.array(embeddings)

center = embeddings.mean(axis=0)
distances = np.sqrt(((embeddings - center) ** 2).sum(axis=1))

if "std" in outliers_config:
cutoff = (
np.sqrt((embeddings.std(axis=0) ** 2).sum())
* outliers_config["std"]
)
else: # "samples" in outliers_config
distance_distribution = np.sort(distances)
samples = outliers_config["samples"]
if isinstance(samples, float):
samples = int(samples * (len(distance_distribution) - 1))
cutoff = distance_distribution[samples]

keep = outliers_config.get("keep", False)
include = distances > cutoff if keep else distances <= cutoff

output_data = [
key_to_doc[tuple([sample[key] for key in keys])] for sample in samples
]
output_data = [item for idx, item in enumerate(input_data) if include[idx]]
else:
stratify = None
if "stratify" in self.config:
stratify = [data[self.config["stratify"]] for data in input_data]
samples = self.config["samples"]
if isinstance(samples, list):
keys = list(samples[0].keys())
key_to_doc = {
tuple([doc[key] for key in keys]): doc for doc in input_data
}

import sklearn.model_selection
output_data = [
key_to_doc[tuple([sample[key] for key in keys])]
for sample in samples
]
else:
stratify = None
if "stratify" in self.config:
stratify = [data[self.config["stratify"]] for data in input_data]

output_data, dummy = sklearn.model_selection.train_test_split(
input_data,
train_size=samples,
random_state=self.config.get("random_state", None),
stratify=stratify,
)
return output_data, 0
import sklearn.model_selection

output_data, dummy = sklearn.model_selection.train_test_split(
input_data,
train_size=samples,
random_state=self.config.get("random_state", None),
stratify=stratify,
)

return output_data, cost
60 changes: 0 additions & 60 deletions docs/operators/outliers.md

This file was deleted.

41 changes: 32 additions & 9 deletions docs/operators/sample.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,44 @@ operation you add while developing your pipeline!
```
This sample operation will return a pseudo-randomly selected 10% of
the samples (`samples: 0.1`). The random selection will be seeded with
the samples (samples: 0.1). The random selection will be seeded with
a constant (42), meaning the same selection will be returned if you
rerun the pipeline (If no random state is given, a different sample
will be returned every time). Additionally, the random sampling will
sample each value of the `category` key equally.
sample each value of the category key equally.
## Required Parameters
- `name`: A unique name for the operation.
- `type`: Must be set to "sample".
- `samples`: Either a list of key-value pairs representing document ids and values, an integer count of samples, or a float fraction of samples.
- name: A unique name for the operation.
- type: Must be set to "sample".
- samples: Either a list of key-value pairs representing document ids and values, an integer count of samples, or a float fraction of samples.
## Optional Parameters
| Parameter | Description | Default |
| ------------- | -------------------------------------------- | ----------------------------------- |
| `random_state` | An integer to seed the random generator with | Use the (numpy) global random state |
| `stratify` | The key to stratify by | |
| Parameter | Description | Default |
| ------------ | -------------------------------------------- | ----------------------------------- |
| random_state | An integer to seed the random generator with | Use the (numpy) global random state |
| stratify | The key to stratify by | |
## Outliers
The Sample operation can also be used to sample outliers. To do this, instead of specifying "samples", specify an "outliers" object with the following parameters:
- embedding_keys: A list of keys to use for creating embeddings.
- std: The number of standard deviations to use as the cutoff for outliers.
- samples: The number or fraction of samples to consider as outliers.
- keep: Whether to keep (true) or remove (false) the outliers. Defaults to false.
You must specify either "std" or "samples" in the outliers configuration, but not both.
Example:
```yaml
- name: remove-worst-10
type: sample
outliers:
embedding_keys:
- concept
- description
samples: 0.9
```
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ nav:
- Split: operators/split.md
- Gather: operators/gather.md
- Unnest: operators/unnest.md
- Sample: operators/sample.md
- Optimization:
- Overview: optimization/overview.md
- Example: optimization/example.md
Expand Down
Loading

0 comments on commit 38a073f

Please sign in to comment.