Skip to content

Commit

Permalink
API: allow more configuration of adaptive round robin (#113)
Browse files Browse the repository at this point in the history
* API: Add n-top
* DOC MAINT: show validation sampling in docs
  • Loading branch information
stsievert authored Jul 15, 2021
1 parent 1deb480 commit 6686eae
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 35 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Documentation build

on:
release:
types: [published]
on: push
# on:
# release:
# types: [published]

# Only run when release published (not created or edited, etc)
# https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release
Expand Down
19 changes: 10 additions & 9 deletions docs/source/_static/alieneggs.html
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
<div>
Redirecting to <a href="http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421">http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421</a>...
</div>

<script>
var redirect = ["http://ec2-44-226-205-171.us-west-2.compute.amazonaws.com:8421/", "http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421/"];

window.location.href = redirect[Math.floor(Math.random() * 2)]
</script>
<!DOCTYPE html>
<html>
<body>
<script>
var urls = ["http://18.237.56.89:8421", "http://52.38.73.113:8421"];
window.location.href = urls[Math.floor(Math.random() * urls.length)]
</script>
<p>Please enable JavaScript to be randomly redirected.</p>
</body>
</html>
1 change: 1 addition & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Passive Algorithms

salmon.triplets.samplers.RandomSampling
salmon.triplets.samplers.RoundRobin
salmon.triplets.samplers.Validation

Active Algorithms
^^^^^^^^^^^^^^^^^
Expand Down
46 changes: 29 additions & 17 deletions salmon/triplets/samplers/_adaptive_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def __init__(

self.n_search = kwargs.pop("n_search", 0)


Opt = getattr(adaptive, optimizer)
Module = getattr(adaptive, module)

Expand Down Expand Up @@ -125,10 +124,6 @@ def get_query(self) -> Tuple[Optional[Dict[str, int]], Optional[float]]:

def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], dict]:
"""Get and score many queries."""
if num or self.n_search:
n_ret = int(num or self.n_search)
queries, scores = self.search.score(num=n_ret)
return queries[:n_ret], scores[:n_ret], {}
ret_queries = []
ret_scores = []
n_searched = 0
Expand All @@ -148,6 +143,11 @@ def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], di
# let's limit it to be 32MB in size
if (n_searched >= 2e6) or (stop is not None and stop.is_set()):
break
if num or self.n_search:
n_ret = int(num or self.n_search)
if n_searched >= 3 * n_ret:
break

queries = np.concatenate(ret_queries).astype(int)
scores = np.concatenate(ret_scores)
queries = self._sort_query_order(queries)
Expand All @@ -156,7 +156,13 @@ def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], di
df = pd.DataFrame(queries)
hashes = pd.util.hash_pandas_object(df, index=False)
_, idx = np.unique(hashes.to_numpy(), return_index=True)
return queries[idx], scores[idx], {}
queries = queries[idx]
scores = scores[idx]
if num or self.n_search:
n_ret = int(num or self.n_search)
queries = queries[:n_ret]
scores = scores[:n_ret]
return queries, scores, {}

@staticmethod
def _sort_query_order(queries: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -334,35 +340,40 @@ def __init__(self, alpha=1, **kwargs):
class ARR(Adaptive):
"""A randomized round robin algorithm.
In practice, this sampling algorithm randomly asks about high scoring
queries for each head.
Notes
-----
This algorithm is proposed in [1]_. They propose this algorithm because
"scoring every triplet is prohibitvely expensive." It's also useful because it adds some randomness to the queries. This presents itself in a couple use cases:
This algorithms asks about "high scoring queries" uniformly at random. For
each head, the top ``n_top`` queries are selected. The query shown to the
user is a query selected uniformly at random from this set.
* When models don't update instantly (common). In that case, the user will
query the database for multiple queries, and queries with the same head
object may be returned.
* When the noise model does not precisely model the human responses. In
this case, the most informative query will
This algorithm is proposed because "scoring every triplet is prohibitively
expensive." It's perhaps more useful with Salmon's complete search
because it adds some randomness to the query shown to the user.
References
----------
.. [1] Heim, Eric, et al. "Active perceptual similarity modeling withi
.. [1] Heim, Eric, et al. "Active perceptual similarity modeling with
auxiliary information." arXiv preprint arXiv:1511.02254 (2015). https://arxiv.org/abs/1511.02254
"""

def __init__(self, R: int = 1, module="TSTE", **kwargs):
def __init__(self, R: int = 1, n_top=3, module="TSTE", **kwargs):
"""
Parameters
----------
R: int = 1
R: int (optional, default ``1``)
Adaptive sampling starts after ``R * n`` responses have been received.
module : str, optional (default ``"TSTE"``).
The noise model to use.
n_top : int (optional, default ``3``)
For each head, the number of top-scoring queries to ask about.
kwargs : dict
Keyword arguments to pass to :class:`~salmon.triplets.samplers.Adaptive`.
"""
self.n_top = n_top
super().__init__(R=R, module=module, **kwargs)

def get_queries(self, *args, **kwargs):
Expand All @@ -373,11 +384,12 @@ def get_queries(self, *args, **kwargs):
df["score"] = scores

# Find the top scores per head
top_scores_by_head = df.groupby(by="h")["score"].nlargest(n=3)
top_scores_by_head = df.groupby(by="h")["score"].nlargest(n=self.n_top)
top_idx = top_scores_by_head.index.droplevel(0)

top_queries = df.loc[top_idx]
top_scores = top_queries["score"].to_numpy()
top_queries = top_queries.sample(frac=1, replace=False)

posted = top_queries[["h", "l", "r"]].to_numpy().astype("int64")
r_scores = np.random.uniform(low=10, high=11, size=len(posted))
Expand Down
1 change: 1 addition & 0 deletions salmon/triplets/samplers/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@


class Validation(RoundRobin):
"""Ask about the same queries repeatedly"""
def __init__(self, n, d=2, n_queries=20, ident=""):
"""
This sampler asks the same questions repeatedly, useful to evaluate
Expand Down
4 changes: 2 additions & 2 deletions templates/dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,8 @@ <h3>Queries</h3>
</p>
<p style="width: 600px;">
Also, algorithm <tt>ARR</tt> posts <tt>3*n</tt> of the highest scoring
queries to the database when <tt>n</tt> is the number of targets.
The label "scored_(complete)" records this value.
queries to the database by default, where <tt>n</tt> is the number of
targets. The label "scored_(complete)" records this value.
</p>
</div>
<div class="row justify-content-center">
Expand Down
13 changes: 9 additions & 4 deletions templates/query_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,7 @@
var response_time = getTime() - response_start;
num_queries = num_queries + 1;
$("#num-queries").html("" + num_queries);
$.post("/answer",
JSON.stringify({
var data = {
"head": head,
"left": left,
"right": right,
Expand All @@ -181,8 +180,14 @@
"network_latency": latency,
"score": score,
"alg_ident": ident,
})
);
};
$.ajax({
"url": "/answer",
"method": "POST",
"dataType": "json",
"contentType": "application/json",
"data": JSON.stringify(data),
});
}
}

Expand Down

0 comments on commit 6686eae

Please sign in to comment.