Skip to content

Commit

Permalink
API: allow more configuration of adaptive round robin (#113)
Browse files Browse the repository at this point in the history
* API: Add n-top
* DOC MAINT: show validation sampling in docs
  • Loading branch information
stsievert authored Jul 15, 2021
1 parent 1deb480 commit 6686eae
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 35 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: Documentation build

on:
release:
types: [published]
on: push
# on:
# release:
# types: [published]

# Only run when release published (not created or edited, etc)
# https://docs.github.com/en/actions/reference/events-that-trigger-workflows#release
Expand Down
19 changes: 10 additions & 9 deletions docs/source/_static/alieneggs.html
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
<div>
Redirecting to <a href="http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421">http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421</a>...
</div>

<script>
var redirect = ["http://ec2-44-226-205-171.us-west-2.compute.amazonaws.com:8421/", "http://ec2-44-242-160-78.us-west-2.compute.amazonaws.com:8421/"];

window.location.href = redirect[Math.floor(Math.random() * 2)]
</script>
<!DOCTYPE html>
<html>
<body>
<script>
var urls = ["http://18.237.56.89:8421", "http://52.38.73.113:8421"];
window.location.href = urls[Math.floor(Math.random() * urls.length)]
</script>
<p>Please enable JavaScript to be randomly redirected.</p>
</body>
</html>
1 change: 1 addition & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Passive Algorithms

salmon.triplets.samplers.RandomSampling
salmon.triplets.samplers.RoundRobin
salmon.triplets.samplers.Validation

Active Algorithms
^^^^^^^^^^^^^^^^^
Expand Down
46 changes: 29 additions & 17 deletions salmon/triplets/samplers/_adaptive_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def __init__(

self.n_search = kwargs.pop("n_search", 0)


Opt = getattr(adaptive, optimizer)
Module = getattr(adaptive, module)

Expand Down Expand Up @@ -125,10 +124,6 @@ def get_query(self) -> Tuple[Optional[Dict[str, int]], Optional[float]]:

def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], dict]:
"""Get and score many queries."""
if num or self.n_search:
n_ret = int(num or self.n_search)
queries, scores = self.search.score(num=n_ret)
return queries[:n_ret], scores[:n_ret], {}
ret_queries = []
ret_scores = []
n_searched = 0
Expand All @@ -148,6 +143,11 @@ def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], di
# let's limit it to be 32MB in size
if (n_searched >= 2e6) or (stop is not None and stop.is_set()):
break
if num or self.n_search:
n_ret = int(num or self.n_search)
if n_searched >= 3 * n_ret:
break

queries = np.concatenate(ret_queries).astype(int)
scores = np.concatenate(ret_scores)
queries = self._sort_query_order(queries)
Expand All @@ -156,7 +156,13 @@ def get_queries(self, num=None, stop=None) -> Tuple[List[Query], List[float], di
df = pd.DataFrame(queries)
hashes = pd.util.hash_pandas_object(df, index=False)
_, idx = np.unique(hashes.to_numpy(), return_index=True)
return queries[idx], scores[idx], {}
queries = queries[idx]
scores = scores[idx]
if num or self.n_search:
n_ret = int(num or self.n_search)
queries = queries[:n_ret]
scores = scores[:n_ret]
return queries, scores, {}

@staticmethod
def _sort_query_order(queries: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -334,35 +340,40 @@ def __init__(self, alpha=1, **kwargs):
class ARR(Adaptive):
"""A randomized round robin algorithm.
In practice, this sampling algorithm randomly asks about high scoring
queries for each head.
Notes
-----
This algorithm is proposed in [1]_. They propose this algorithm because
"scoring every triplet is prohibitvely expensive." It's also useful because it adds some randomness to the queries. This presents itself in a couple use cases:
This algorithms asks about "high scoring queries" uniformly at random. For
each head, the top ``n_top`` queries are selected. The query shown to the
user is a query selected uniformly at random from this set.
* When models don't update instantly (common). In that case, the user will
query the database for multiple queries, and queries with the same head
object may be returned.
* When the noise model does not precisely model the human responses. In
this case, the most informative query will
This algorithm is proposed because "scoring every triplet is prohibitively
expensive." It's perhaps more useful with Salmon's complete search
because it adds some randomness to the query shown to the user.
References
----------
.. [1] Heim, Eric, et al. "Active perceptual similarity modeling withi
.. [1] Heim, Eric, et al. "Active perceptual similarity modeling with
auxiliary information." arXiv preprint arXiv:1511.02254 (2015). https://arxiv.org/abs/1511.02254
"""

def __init__(self, R: int = 1, module="TSTE", **kwargs):
def __init__(self, R: int = 1, n_top=3, module="TSTE", **kwargs):
"""
Parameters
----------
R: int = 1
R: int (optional, default ``1``)
Adaptive sampling starts after ``R * n`` responses have been received.
module : str, optional (default ``"TSTE"``).
The noise model to use.
n_top : int (optional, default ``3``)
For each head, the number of top-scoring queries to ask about.
kwargs : dict
Keyword arguments to pass to :class:`~salmon.triplets.samplers.Adaptive`.
"""
self.n_top = n_top
super().__init__(R=R, module=module, **kwargs)

def get_queries(self, *args, **kwargs):
Expand All @@ -373,11 +384,12 @@ def get_queries(self, *args, **kwargs):
df["score"] = scores

# Find the top scores per head
top_scores_by_head = df.groupby(by="h")["score"].nlargest(n=3)
top_scores_by_head = df.groupby(by="h")["score"].nlargest(n=self.n_top)
top_idx = top_scores_by_head.index.droplevel(0)

top_queries = df.loc[top_idx]
top_scores = top_queries["score"].to_numpy()
top_queries = top_queries.sample(frac=1, replace=False)

posted = top_queries[["h", "l", "r"]].to_numpy().astype("int64")
r_scores = np.random.uniform(low=10, high=11, size=len(posted))
Expand Down
1 change: 1 addition & 0 deletions salmon/triplets/samplers/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@


class Validation(RoundRobin):
"""Ask about the same queries repeatedly"""
def __init__(self, n, d=2, n_queries=20, ident=""):
"""
This sampler asks the same questions repeatedly, useful to evaluate
Expand Down
4 changes: 2 additions & 2 deletions templates/dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,8 @@ <h3>Queries</h3>
</p>
<p style="width: 600px;">
Also, algorithm <tt>ARR</tt> posts <tt>3*n</tt> of the highest scoring
queries to the database when <tt>n</tt> is the number of targets.
The label "scored_(complete)" records this value.
queries to the database by default, where <tt>n</tt> is the number of
targets. The label "scored_(complete)" records this value.
</p>
</div>
<div class="row justify-content-center">
Expand Down
13 changes: 9 additions & 4 deletions templates/query_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,7 @@
var response_time = getTime() - response_start;
num_queries = num_queries + 1;
$("#num-queries").html("" + num_queries);
$.post("/answer",
JSON.stringify({
var data = {
"head": head,
"left": left,
"right": right,
Expand All @@ -181,8 +180,14 @@
"network_latency": latency,
"score": score,
"alg_ident": ident,
})
);
};
$.ajax({
"url": "/answer",
"method": "POST",
"dataType": "json",
"contentType": "application/json",
"data": JSON.stringify(data),
});
}
}

Expand Down

0 comments on commit 6686eae

Please sign in to comment.