
Merge pull request #27 from googleinterns/test_and_format
Test and format
naveenram00 authored Jul 30, 2021
2 parents e899042 + 76962d1 commit 346f3cc
Showing 7 changed files with 240 additions and 49 deletions.
17 changes: 13 additions & 4 deletions data/build_probe_1_data.py
@@ -35,6 +35,9 @@
flags.DEFINE_integer("probe_min_pop", 30, "minimum popularity to be in probe")
flags.DEFINE_integer("popular_min_pop", 138, "minimum popularity to be"
+ " considered a popular movie")
flags.DEFINE_enum("format", "normal", ["normal", "sequences"],
"specify the probe format: normal for pairs in dialogue, "
+ "sequences for movie only probes for sequences task")


def create_pmi(co_matrix, movie_ids):
@@ -242,10 +245,16 @@ def parse_sequence(sequence_str):
random_list = random.sample(popular_movies, k=10)

for related, rand in zip(related_list, random_list):
prompt = f"[User] Can you recommend me a movie like @ {movie} @"
probes.append(f"{prompt}\tSure, have you seen @ {related} @?")
probes.append(f"{prompt}\tSure, have you seen @ {rand} @?")
probe_1_path = constants.PROBE_1_TSV_PATH["validation"]
if FLAGS.format == "sequences":
probes.append(f"@ {movie} @\t{related}")
probes.append(f"@ {movie} @\t{rand}")
path, extension = constants.PROBE_1_TSV_PATH["validation"].split(".")
probe_1_path = path + "_sequences" + "." + extension
else:
prompt = f"[User] Can you recommend me a movie like @ {movie} @"
probes.append(f"{prompt}\tSure, have you seen @ {related} @?")
probes.append(f"{prompt}\tSure, have you seen @ {rand} @?")
probe_1_path = constants.PROBE_1_TSV_PATH["validation"]

logging.info("%d pairs generated", len(probes))
with tf.io.gfile.GFile(probe_1_path, "w") as f:
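For reference, the new --format flag toggles between two TSV row layouts for probe 1. A minimal sketch of the two layouts is below; the movie titles are made up purely for illustration and are not from the dataset.

movie, related, rand = "Heat (1995)", "Ronin (1998)", "Babe (1995)"

# "normal" format: dialogue prompt plus response, one (related, random) pair per probe
normal_rows = [
    f"[User] Can you recommend me a movie like @ {movie} @\tSure, have you seen @ {related} @?",
    f"[User] Can you recommend me a movie like @ {movie} @\tSure, have you seen @ {rand} @?",
]
# "sequences" format: movie-only pairs for the sequences task
sequences_rows = [
    f"@ {movie} @\t{related}",
    f"@ {movie} @\t{rand}",
]
for row in normal_rows + sequences_rows:
  print(row)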
144 changes: 144 additions & 0 deletions data/visualize_popularities.py
@@ -0,0 +1,144 @@
# Copyright 2020 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Visualize the popularity bias for a given model's Probe 1."""

import json
import os

from absl import app
from absl import flags
from absl import logging
import numpy as np
import plotly.graph_objects as go
import tensorflow.compat.v1 as tf
from trainer import constants


FLAGS = flags.FLAGS

flags.DEFINE_enum("size", "base", ["small", "base", "large", "3B", "11B"],
"model size")
flags.DEFINE_string("name", "default", "name/description of model version")
flags.DEFINE_string("subfolder", None, ("subfolder under size folder to put ",
"model in. if None, the model folder",
" will be in bucket/models/size"))


def tf_load_txt(filepath):
  """Load newline-separated text from gs:// using tf.io.

  Args:
    filepath: path of the file to be read

  Returns:
    a list of strings containing the lines of the file
  """
  with tf.io.gfile.GFile(filepath, "r") as txt_file:
    data = []
    for row in list(txt_file):
      data.append(str(row.replace("\n", "")))
  return data


def load_probe_data(model_dir, probe):
  """Load the probe data of a given model.

  Args:
    model_dir: the directory of a given model
    probe: the name of the probe

  Returns:
    a tuple containing the inputs, targets, predictions and steps
  """
  eval_path = os.path.join(model_dir, "validation_eval")
  inputs = [x[2:-1] for x in tf_load_txt(os.path.join(eval_path,
                                                      f"{probe}_inputs"))]
  targets = tf_load_txt(os.path.join(eval_path, f"{probe}_targets"))
  prediction_path = os.path.join(eval_path, f"{probe}*_predictions")
  prediction_files = sorted(tf.io.gfile.glob(prediction_path),
                            key=lambda x: int(x.split("_")[-2]))
  predictions = []
  steps = []

  for pred_file in prediction_files:
    ckpt_step = int(pred_file.split("_")[-2])
    steps.append(ckpt_step)
    predictions.append(tf_load_txt(pred_file))

  return inputs, targets, predictions, steps


def main(_):

  # set the model dir
  model_dir = os.path.join(constants.MODELS_DIR, FLAGS.size)
  if FLAGS.subfolder is not None:
    model_dir = os.path.join(model_dir, FLAGS.subfolder)
  model_dir = os.path.join(model_dir, FLAGS.name)

  # load the popularity data
  with tf.io.gfile.GFile(constants.MATRIX_PATHS["movie_ids"], "r") as f:
    movie_ids = json.load(f)

  # load the probe 1 data for the given model
  inputs, targets, predictions, steps = load_probe_data(model_dir, "probe_1")
  predictions = predictions[-1]
  steps = steps[-1]
  movie_ids["popularity"] = {k.lower(): v for k, v
                             in movie_ids["popularity"].items()}

  # keep track of the correctly and incorrectly classified pairs
  correct = []
  incorrect = []

  pairs = [(i, i+1) for i in range(0, len(predictions), 2)]
  for i1, i2 in pairs:
    query = inputs[i1].split("@")[1].strip()
    related = targets[i1].split("@")[1].strip()
    random = targets[i2].split("@")[1].strip()
    if (related in movie_ids["popularity"] and random in movie_ids["popularity"]
        and query in movie_ids["popularity"]):
      if float(predictions[i1]) >= float(predictions[i2]):
        correct.append((query, related, random))
      else:
        incorrect.append((query, related, random))

  correct_popularities = [movie_ids["popularity"][x[0]] for x in correct]
  incorrect_popularities = [movie_ids["popularity"][x[0]] for x in incorrect]

  # plot the correctly and incorrectly classified pairs on a histogram
  fig = go.Figure()
  fig.add_trace(go.Histogram(x=correct_popularities, name="correct"))
  fig.add_trace(go.Histogram(x=incorrect_popularities, name="incorrect"))

  fig.update_layout(barmode="overlay")
  fig.update_traces(opacity=0.5)
  fig.update_layout(
      title="Correct vs Incorrect Popularity Distributions",
      xaxis_title="Popularity",
      yaxis_title="Frequency"
  )
  fig.show()

  # log mean/median differences
  logging.info("Correct ----------")
  logging.info("mean: %d median %d", np.mean(correct_popularities),
               np.median(correct_popularities))
  logging.info("Incorrect ----------")
  logging.info("mean: %d median %d", np.mean(incorrect_popularities),
               np.median(incorrect_popularities))

if __name__ == "__main__":
  app.run(main)
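As a quick sanity check of the pairing logic in main(), the sketch below walks through the correct/incorrect split on toy data. The scores and query titles are made up, and it assumes (as the script does) that each prediction file holds one numeric score per line, with the related and random probes on consecutive lines.

# Toy illustration of the correct/incorrect split; values are invented.
predictions = ["-1.2", "-3.4", "-2.0", "-0.5"]  # (related, random) scores per pair
queries = ["heat (1995)", "babe (1995)"]

correct, incorrect = [], []
for pair_index, (i1, i2) in enumerate((i, i + 1) for i in range(0, len(predictions), 2)):
  bucket = correct if float(predictions[i1]) >= float(predictions[i2]) else incorrect
  bucket.append(queries[pair_index])

print(correct)    # ['heat (1995)']
print(incorrect)  # ['babe (1995)']

Assuming the repository layout above, the script would presumably be run as python -m data.visualize_popularities --size=base --name=default, though the exact invocation depends on how the environment and bucket paths are configured.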
75 changes: 49 additions & 26 deletions test/test_build_movielens.py
@@ -12,33 +12,56 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit Tests for E2E Convrec modules"""
from data import build_movielens
"""Unit Tests for E2E Convrec modules."""
import unittest

from data import build_movielens


class TestBuildMovielens(unittest.TestCase):

def test_flip_titles(self):
test_inputs = [
"Green Mile, The (1999) @ Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966) @ Devil's Advocate, The (1997) ",
"King's Speech, The (2010) @ Social Network, The (2010) @ Catch Me If You Can (2002)",
"Brady Bunch Movie, The (1995) @ Shining, The (1980) @ Cool Hand Luke (1967)",
"House Bunny, The (2008)",
"Ten Commandments, The (1956)",
"Fake Movie, the (subtitle) weirdness, (0000)"
]

test_outputs = [
"The Green Mile (1999) @ The Good, the Bad and the Ugly (Buono, il brutto, il cattivo, Il) (1966) @ The Devil's Advocate (1997)",
"The King's Speech (2010) @ The Social Network (2010) @ Catch Me If You Can (2002)",
"The Brady Bunch Movie (1995) @ The Shining (1980) @ Cool Hand Luke (1967)",
"The House Bunny (2008)",
"The Ten Commandments (1956)",
"the Fake Movie (subtitle) weirdness, (0000)"
]
for test_input, test_output in zip(test_inputs, test_outputs):
print(build_movielens.flip_titles(test_input), test_output)
self.assertEqual(build_movielens.flip_titles(test_input), test_output, "should put title in order")

if __name__ == '__main__':
unittest.main()
def test_flip_titles(self):
test_inputs = [
"Green Mile, The (1999) @ Good, the Bad and the Ugly, The (Buono, il brut"
+ "to, il cattivo, Il) (1966) @ Devil's Advocate, The (1997) ",
"King's Speech, The (2010) @ Social Network, The (2010) @ Catch Me If You"
+ " Can (2002)",
"Brady Bunch Movie, The (1995) @ Shining, The (1980) @ Cool Hand Luke (19"
+ "67)", "House Bunny, The (2008)", "Ten Commandments, The (1956)",
"Fake Movie, the (subtitle) weirdness, (0000)"
]

test_outputs = [
"The Green Mile (1999) @ The Good, the Bad and the Ugly (Buono, il brutto"
+ ", il cattivo, Il) (1966) @ The Devil's Advocate (1997)",
"The King's Speech (2010) @ The Social Network (2010) @ Catch Me If You "
+ "Can (2002)",
"The Brady Bunch Movie (1995) @ The Shining (1980) @ Cool Hand Luke "
+ "(1967)",
"The House Bunny (2008)",
"The Ten Commandments (1956)",
"the Fake Movie (subtitle) weirdness, (0000)"
]
for test_input, test_output in zip(test_inputs, test_outputs):
self.assertEqual(build_movielens.flip_titles(test_input), test_output,
"should put title in order")

def test_parse_sequence(self):
test_sequences = [
"(1, [1, 2])",
"(2, [3, 4]",
"(3, []",
"(4, [1, 1])"
]
expected_parsed = [
[1, 2],
[3, 4],
[],
[1, 1]
]
for test_seq, ex_parsed in zip(test_sequences, expected_parsed):
self.assertEqual(build_movielens.parse_user_seq(test_seq),
ex_parsed, "incorrect string -> list parsing")

if __name__ == "__main__":
unittest.main()
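The parse_user_seq implementation itself is not shown in this diff. Purely to illustrate the behaviour the new test expects, including tolerating a missing closing bracket, a minimal regex-based stand-in might look like the sketch below; the name parse_user_seq_sketch is invented and this is not the repository's code.

import re

def parse_user_seq_sketch(sequence_str):
  """Toy stand-in matching the test cases above, not build_movielens.parse_user_seq."""
  # Capture everything between "[" and the next "]" (or the end of the string).
  match = re.search(r"\[([^\]]*)", sequence_str)
  body = match.group(1) if match else ""
  return [int(x) for x in body.split(",") if x.strip()]

assert parse_user_seq_sketch("(1, [1, 2])") == [1, 2]
assert parse_user_seq_sketch("(3, []") == []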
26 changes: 26 additions & 0 deletions test/test_build_probes.py
@@ -95,5 +95,31 @@ def calc_pmi2(co_ab, pop_a, pop_b, num_seq):
self.assertTrue(np.allclose(np.exp(pmi2), np.exp(expected_pmi2)),
"pmi2 calculation incorrect")

def test_get_related_movies(self):
sequences = [
["a", "b", "c"],
["b", "c", "d"],
["c"]
]
movie_ids = build_probe_1_data.create_movie_ids(sequences)
co = build_probe_1_data.create_cooccurrence(sequences, movie_ids)
pmi2 = build_probe_1_data.create_pmi(co, movie_ids)
all_movies = movie_ids["all_movies"]
movie_ids["id_to_movie"] = dict(zip([str(x) for x in
range(len(all_movies))], all_movies))
filtered_set = set(movie_ids["all_movies"])

expected_related = {
"a": ["b", "c", "d"],
"b": ["c", "d", "a"],
"c": ["b", "d", "a"],
"d": ["b", "c", "a"]
}

for movie, ex_related in expected_related.items():
related = build_probe_1_data.get_related_movies(movie, movie_ids, pmi2,
filtered_set, k=3)
self.assertEqual(", ".join(ex_related), ", ".join(related))

if __name__ == "__main__":
unittest.main()
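For context on the PMI-based tests above: one common definition scores a pair as PMI2(a, b) = log(p(a, b)^2 / (p(a) * p(b))), with probabilities estimated from sequence counts. The repository's calc_pmi2 may differ in details, but under that assumed formula the toy corpus in the test (pair ("a", "b") co-occurs once, "a" appears once, "b" twice, three sequences total) works out as in this sketch.

import numpy as np

# Toy counts drawn from the sequences used in the test.
co_ab, pop_a, pop_b, num_seq = 1, 1, 2, 3
p_ab, p_a, p_b = co_ab / num_seq, pop_a / num_seq, pop_b / num_seq
pmi2 = np.log(p_ab**2 / (p_a * p_b))
print(pmi2)  # log(0.5), roughly -0.69 under this assumed formula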
2 changes: 1 addition & 1 deletion trainer/constants.py
@@ -53,7 +53,7 @@
PROBE_1_TSV_PATH = {
"validation": os.path.join(PROBE_DIR, "probe_1.tsv")
}
PROBE_1_SEQ_PATH = {
PROBE_1_SEQ_TSV_PATH = {
"validation": os.path.join(PROBE_DIR, "probe_1_sequences.tsv")
}
PROBE_2_TSV_PATH = {
23 changes: 6 additions & 17 deletions trainer/finetune.py
@@ -50,6 +50,7 @@
"a step number or -1 for latest"))
flags.DEFINE_enum("tags_version", "normal", ["normal", "reversed", "masked"],
"version of the tags dataset: normal, reversed, or masked")
flags.DEFINE_integer("eval_start", 999900, "step at which to start eval")
flags.DEFINE_integer("beam_size", 1, "beam size for saved model")
flags.DEFINE_float("temperature", 1.0, "temperature for saved model")
flags.DEFINE_float("learning_rate", .003, "learning rate for finetuning")
@@ -107,8 +108,6 @@ def main(_):
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[
preprocessing.preprocessor_wrapper("rd_recommendations")],
# Use the same vocabulary that we used for pre-training.
# sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
# Lowercase targets before computing metrics.
postprocess_fn=t5.data.postprocessors.lower_text,
# We'll use bleu, bleu no titles, and recall as our evaluation metrics.
@@ -124,11 +123,9 @@
splits=["train", "validation"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[preprocessing.preprocessor_wrapper("ml_sequences")],
# Use the same vocabulary that we used for pre-training.
# sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
# Lowercase targets before computing metrics.
postprocess_fn=t5.data.postprocessors.lower_text,
# We'll use accuracy/recall as our evaluation metric.
# We'll use accuracy as our evaluation metric.
metric_fns=[t5.evaluation.metrics.accuracy])

# set up the ml-tags task (training on movielens tags and genres)
@@ -141,11 +138,9 @@
splits=["train", "validation"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[preprocessing.preprocessor_wrapper("ml_tags")],
# Use the same vocabulary that we used for pre-training.
# sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
# Lowercase targets before computing metrics.
postprocess_fn=t5.data.postprocessors.lower_text,
# We'll use accuracy/recall and bleu as our evaluation metrics.
# We'll use accuracy as our evaluation metric.
metric_fns=[t5.evaluation.metrics.accuracy])

# set up the ml-reviews task (training on movielens movies with imdb reviews)
@@ -157,31 +152,25 @@
splits=["train", "validation"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[preprocessing.preprocessor_wrapper("ml_reviews")],
# Use the same vocabulary that we used for pre-training.
# sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
# Lowercase targets before computing metrics.
postprocess_fn=t5.data.postprocessors.lower_text,
# We'll use accuracy/recall and bleu as our evaluation metrics.
# We'll use bleu as our evaluation metric.
metric_fns=[metrics.t2t_bleu])

if "probe" in FLAGS.mode:
if "sequences" in FLAGS.mode:
t5.data.TaskRegistry.add(
FLAGS.mode,
# Supply a function which returns a tf.data.Dataset.
dataset_fn=preprocessing.dataset_fn_wrapper(FLAGS.mode),
splits=["validation"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[
preprocessing.preprocessor_wrapper("ml_sequences")],
metric_fns=[metrics.probe_pair_accuracy])
else:
t5.data.TaskRegistry.add(
FLAGS.mode,
# Supply a function which returns a tf.data.Dataset.
dataset_fn=preprocessing.dataset_fn_wrapper(FLAGS.mode),
splits=["validation"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[
preprocessing.preprocessor_wrapper("rd_recommendations")],
metric_fns=[metrics.probe_pair_accuracy])
@@ -239,14 +228,14 @@ def main(_):
model.batch_size = train_batch_size * 8
model.eval(
mixture_or_task_name=FLAGS.task,
checkpoint_steps=list(range(999900, 999901+FLAGS.steps, 2000)),
checkpoint_steps=list(range(FLAGS.eval_start, 999901 + FLAGS.steps, 2000)),
compute_sequence_length=False
)

if "probe" in FLAGS.mode:
model.batch_size = train_batch_size * 8

for steps in range(999900, 999901+FLAGS.steps, 2000):
for steps in range(FLAGS.eval_start, 999901 + FLAGS.steps, 2000):
model.eval(
mixture_or_task_name=FLAGS.mode,
checkpoint_steps=steps,
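The effect of the new eval_start flag is easiest to see on the checkpoint schedule itself. A quick illustration with arbitrarily chosen flag values (previously the start step was hard-coded at 999900):

eval_start, steps = 999900, 10000
checkpoint_steps = list(range(eval_start, 999901 + steps, 2000))
print(checkpoint_steps)
# [999900, 1001900, 1003900, 1005900, 1007900, 1009900]

Raising eval_start simply skips evaluation of the earlier checkpoints.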
