Skip to content

Commit

Permalink
refactor: combine sampling and outlier operators
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Oct 12, 2024
1 parent cf6d06f commit 38a073f
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 170 deletions.
82 changes: 0 additions & 82 deletions docetl/operations/outliers.py

This file was deleted.

121 changes: 102 additions & 19 deletions docetl/operations/sample.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from docetl.operations.base import BaseOperation
from docetl.operations.clustering_utils import get_embeddings_for_clustering


class SampleOperation(BaseOperation):
Expand All @@ -18,7 +20,52 @@ def syntax_check(self) -> None:
ValueError: If required keys are missing or invalid in the configuration.
TypeError: If configuration values have incorrect types.
"""
pass
if "samples" not in self.config and "outliers" not in self.config:
raise ValueError(
"Must specify either 'samples' or 'outliers' in SampleOperation configuration"
)

if "samples" in self.config:
if not isinstance(self.config["samples"], (int, float, list)) or (
isinstance(self.config["samples"], (int, float))
and self.config["samples"] <= 0
):
raise TypeError("'samples' must be a positive integer, float, or list")

if "outliers" in self.config:
outliers_config = self.config["outliers"]
if "std" not in outliers_config and "samples" not in outliers_config:
raise ValueError(
"Must specify either 'std' or 'samples' in outliers configuration"
)

if "std" in outliers_config:
if (
not isinstance(outliers_config["std"], (int, float))
or outliers_config["std"] <= 0
):
raise TypeError("'std' in outliers must be a positive number")

if "samples" in outliers_config:
if (
not isinstance(outliers_config["samples"], (int, float))
or outliers_config["samples"] <= 0
):
raise TypeError(
"'samples' in outliers must be a positive integer or float"
)

if "embedding_keys" not in outliers_config:
raise ValueError(
"'embedding_keys' must be specified in outliers configuration"
)

if not isinstance(outliers_config["embedding_keys"], list) or not all(
isinstance(key, str) for key in outliers_config["embedding_keys"]
):
raise TypeError(
"'embedding_keys' in outliers must be a list of strings"
)

def execute(
self, input_data: List[Dict], is_build: bool = False
Expand All @@ -35,26 +82,62 @@ def execute(
Tuple[List[Dict], float]: A tuple containing the filtered
list of dictionaries and the total cost of the operation.
"""
cost = 0
if not input_data:
return [], cost

samples = self.config["samples"]
if isinstance(samples, list):
keys = list(samples[0].keys())
key_to_doc = {tuple([doc[key] for key in keys]): doc for doc in input_data}
if "outliers" in self.config:
# Outlier functionality
outliers_config = self.config["outliers"]
embeddings, embedding_cost = get_embeddings_for_clustering(
input_data, outliers_config, self.runner.api
)
cost += embedding_cost
embeddings = np.array(embeddings)

center = embeddings.mean(axis=0)
distances = np.sqrt(((embeddings - center) ** 2).sum(axis=1))

if "std" in outliers_config:
cutoff = (
np.sqrt((embeddings.std(axis=0) ** 2).sum())
* outliers_config["std"]
)
else: # "samples" in outliers_config
distance_distribution = np.sort(distances)
samples = outliers_config["samples"]
if isinstance(samples, float):
samples = int(samples * (len(distance_distribution) - 1))
cutoff = distance_distribution[samples]

keep = outliers_config.get("keep", False)
include = distances > cutoff if keep else distances <= cutoff

output_data = [
key_to_doc[tuple([sample[key] for key in keys])] for sample in samples
]
output_data = [item for idx, item in enumerate(input_data) if include[idx]]
else:
stratify = None
if "stratify" in self.config:
stratify = [data[self.config["stratify"]] for data in input_data]
samples = self.config["samples"]
if isinstance(samples, list):
keys = list(samples[0].keys())
key_to_doc = {
tuple([doc[key] for key in keys]): doc for doc in input_data
}

import sklearn.model_selection
output_data = [
key_to_doc[tuple([sample[key] for key in keys])]
for sample in samples
]
else:
stratify = None
if "stratify" in self.config:
stratify = [data[self.config["stratify"]] for data in input_data]

output_data, dummy = sklearn.model_selection.train_test_split(
input_data,
train_size=samples,
random_state=self.config.get("random_state", None),
stratify=stratify,
)
return output_data, 0
import sklearn.model_selection

output_data, dummy = sklearn.model_selection.train_test_split(
input_data,
train_size=samples,
random_state=self.config.get("random_state", None),
stratify=stratify,
)

return output_data, cost
60 changes: 0 additions & 60 deletions docs/operators/outliers.md

This file was deleted.

41 changes: 32 additions & 9 deletions docs/operators/sample.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,44 @@ operation you add while developing your pipeline!
```
This sample operation will return a pseudo-randomly selected 10% of
the samples (`samples: 0.1`). The random selection will be seeded with
the samples (samples: 0.1). The random selection will be seeded with
a constant (42), meaning the same selection will be returned if you
rerun the pipeline (If no random state is given, a different sample
will be returned every time). Additionally, the random sampling will
sample each value of the `category` key equally.
sample each value of the category key equally.
## Required Parameters
- `name`: A unique name for the operation.
- `type`: Must be set to "sample".
- `samples`: Either a list of key-value pairs representing document ids and values, an integer count of samples, or a float fraction of samples.
- name: A unique name for the operation.
- type: Must be set to "sample".
- samples: Either a list of key-value pairs representing document ids and values, an integer count of samples, or a float fraction of samples.
## Optional Parameters
| Parameter | Description | Default |
| ------------- | -------------------------------------------- | ----------------------------------- |
| `random_state` | An integer to seed the random generator with | Use the (numpy) global random state |
| `stratify` | The key to stratify by | |
| Parameter | Description | Default |
| ------------ | -------------------------------------------- | ----------------------------------- |
| random_state | An integer to seed the random generator with | Use the (numpy) global random state |
| stratify | The key to stratify by | |
## Outliers
The Sample operation can also be used to sample outliers. To do this, instead of specifying "samples", specify an "outliers" object with the following parameters:
- embedding_keys: A list of keys to use for creating embeddings.
- std: The number of standard deviations to use as the cutoff for outliers.
- samples: The number or fraction of samples to consider as outliers.
- keep: Whether to keep (true) or remove (false) the outliers. Defaults to false.
You must specify either "std" or "samples" in the outliers configuration, but not both.
Example:
```yaml
- name: remove-worst-10
type: sample
outliers:
embedding_keys:
- concept
- description
samples: 0.9
```
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ nav:
- Split: operators/split.md
- Gather: operators/gather.md
- Unnest: operators/unnest.md
- Sample: operators/sample.md
- Optimization:
- Overview: optimization/overview.md
- Example: optimization/example.md
Expand Down
Loading

0 comments on commit 38a073f

Please sign in to comment.