diff --git a/data/build_probe_1_data.py b/data/build_probe_1_data.py
index 9ae0643..f4380f3 100644
--- a/data/build_probe_1_data.py
+++ b/data/build_probe_1_data.py
@@ -35,6 +35,9 @@
 flags.DEFINE_integer("probe_min_pop", 30, "minimum popularity to be in probe")
 flags.DEFINE_integer("popular_min_pop", 138, "minimum popularity to be"
                      + " considered a popular movie")
+flags.DEFINE_enum("format", "normal", ["normal", "sequences"],
+                  "specify the probe format: normal for pairs in dialogue, "
+                  + "sequences for movie only probes for sequences task")
 
 
 def create_pmi(co_matrix, movie_ids):
@@ -242,10 +245,16 @@ def parse_sequence(sequence_str):
     random_list = random.sample(popular_movies, k=10)
 
     for related, rand in zip(related_list, random_list):
-      prompt = f"[User] Can you recommend me a movie like @ {movie} @"
-      probes.append(f"{prompt}\tSure, have you seen @ {related} @?")
-      probes.append(f"{prompt}\tSure, have you seen @ {rand} @?")
-  probe_1_path = constants.PROBE_1_TSV_PATH["validation"]
+      if FLAGS.format == "sequences":
+        probes.append(f"@ {movie} @\t{related}")
+        probes.append(f"@ {movie} @\t{rand}")
+        path, extension = constants.PROBE_1_TSV_PATH["validation"].split(".")
+        probe_1_path = path + "_sequences" + "." + extension
+      else:
+        prompt = f"[User] Can you recommend me a movie like @ {movie} @"
+        probes.append(f"{prompt}\tSure, have you seen @ {related} @?")
+        probes.append(f"{prompt}\tSure, have you seen @ {rand} @?")
+        probe_1_path = constants.PROBE_1_TSV_PATH["validation"]
 
   logging.info("%d pairs generated", len(probes))
   with tf.io.gfile.GFile(probe_1_path, "w") as f:
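For reference, a minimal sketch of the two probe line formats the new format flag switches between; the movie titles are made up, but the f-strings mirror the ones added above, and the tab separates model input from target:

    movie, related = "Heat (1995)", "Casino (1995)"  # illustrative titles only

    # --format=normal: dialogue-style pair written to probe_1.tsv
    normal_probe = (f"[User] Can you recommend me a movie like @ {movie} @"
                    f"\tSure, have you seen @ {related} @?")

    # --format=sequences: movie-only pair written to probe_1_sequences.tsv
    sequence_probe = f"@ {movie} @\t{related}"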
diff --git a/data/visualize_popularities.py b/data/visualize_popularities.py
new file mode 100644
index 0000000..21d86de
--- /dev/null
+++ b/data/visualize_popularities.py
@@ -0,0 +1,144 @@
+# Copyright 2020 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Visualize the popularity bias for a given model's Probe 1."""
+
+import json
+import os
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+import plotly.graph_objects as go
+import tensorflow.compat.v1 as tf
+from trainer import constants
+
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_enum("size", "base", ["small", "base", "large", "3B", "11B"],
+                  "model size")
+flags.DEFINE_string("name", "default", "name/description of model version")
+flags.DEFINE_string("subfolder", None, ("subfolder under size folder to put "
+                                        "the model in. If None, the model "
+                                        "folder will be in bucket/models/size"))
+
+
+def tf_load_txt(filepath):
+  """Load newline separated text from gs:// using tf.io.
+
+  Args:
+    filepath: path of the file to be read
+
+  Returns:
+    a list of strings containing the lines of the file
+  """
+  with tf.io.gfile.GFile(filepath, "r") as txt_file:
+    data = []
+    for row in list(txt_file):
+      data.append(str(row.replace("\n", "")))
+    return data
+
+
+def load_probe_data(model_dir, probe):
+  """Load the probe data of a given model.
+
+  Args:
+    model_dir: the directory of a given model
+    probe: the name of the probe
+
+  Returns:
+    a tuple containing the inputs, targets, predictions and steps
+  """
+  eval_path = os.path.join(model_dir, "validation_eval")
+  inputs = [x[2:-1] for x in tf_load_txt(os.path.join(eval_path,
+                                                      f"{probe}_inputs"))]
+  targets = tf_load_txt(os.path.join(eval_path, f"{probe}_targets"))
+  prediction_path = os.path.join(eval_path, f"{probe}*_predictions")
+  prediction_files = sorted(tf.io.gfile.glob(prediction_path),
+                            key=lambda x: int(x.split("_")[-2]))
+  predictions = []
+  steps = []
+
+  for pred_file in prediction_files:
+    ckpt_step = int(pred_file.split("_")[-2])
+    steps.append(ckpt_step)
+    predictions.append(tf_load_txt(pred_file))
+
+  return inputs, targets, predictions, steps
+
+
+def main(_):
+
+  # set the model dir
+  model_dir = os.path.join(constants.MODELS_DIR, FLAGS.size)
+  if FLAGS.subfolder is not None:
+    model_dir = os.path.join(model_dir, FLAGS.subfolder)
+  model_dir = os.path.join(model_dir, FLAGS.name)
+
+  # load the popularity data
+  with tf.io.gfile.GFile(constants.MATRIX_PATHS["movie_ids"], "r") as f:
+    movie_ids = json.load(f)
+
+  # load the probe 1 data for the given model
+  inputs, targets, predictions, steps = load_probe_data(model_dir, "probe_1")
+  predictions = predictions[-1]
+  steps = steps[-1]
+  movie_ids["popularity"] = {k.lower(): v for k, v
+                             in movie_ids["popularity"].items()}
+
+  # keep track of the correctly and incorrectly classified pairs
+  correct = []
+  incorrect = []
+
+  pairs = [(i, i+1) for i in range(0, len(predictions), 2)]
+  for i1, i2 in pairs:
+    query = inputs[i1].split("@")[1].strip()
+    related = targets[i1].split("@")[1].strip()
+    random = targets[i2].split("@")[1].strip()
+    if (related in movie_ids["popularity"] and random in movie_ids["popularity"]
+        and query in movie_ids["popularity"]):
+      if float(predictions[i1]) >= float(predictions[i2]):
+        correct.append((query, related, random))
+      else:
+        incorrect.append((query, related, random))
+
+  correct_popularities = [movie_ids["popularity"][x[0]] for x in correct]
+  incorrect_popularities = [movie_ids["popularity"][x[0]] for x in incorrect]
+
+  # plot the correctly and incorrectly classified pairs on a histogram
+  fig = go.Figure()
+  fig.add_trace(go.Histogram(x=correct_popularities, name="correct"))
+  fig.add_trace(go.Histogram(x=incorrect_popularities, name="incorrect"))
+
+  fig.update_layout(barmode="overlay")
+  fig.update_traces(opacity=0.5)
+  fig.update_layout(
+      title="Correct vs Incorrect Popularity Distributions",
+      xaxis_title="Popularity",
+      yaxis_title="Frequency"
+  )
+  fig.show()
+
+  # log mean/median differences
+  logging.info("Correct ----------")
+  logging.info("mean: %d median %d", np.mean(correct_popularities),
+               np.median(correct_popularities))
+  logging.info("Incorrect ----------")
+  logging.info("mean: %d median %d", np.mean(incorrect_popularities),
+               np.median(incorrect_popularities))
+
+if __name__ == "__main__":
+  app.run(main)
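The new script buckets Probe 1 pairs by whether the model scored the related movie at least as high as the random one, then plots the popularity of the query movie for each bucket. A minimal sketch of that pairing rule outside the plotting code, with made-up strings standing in for the values read from the *_targets and *_predictions files:

    # even indices hold the related completion, odd indices the random one
    targets = ["sure, have you seen @ casino (1995) @?",      # related
               "sure, have you seen @ space jam (1996) @?"]   # random
    predictions = ["-1.3", "-4.2"]  # illustrative per-line scores only
    pairs = [(i, i + 1) for i in range(0, len(predictions), 2)]
    correct = [(i, j) for i, j in pairs
               if float(predictions[i]) >= float(predictions[j])]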
-"""Unit Tests for E2E Convrec modules""" -from data import build_movielens +"""Unit Tests for E2E Convrec modules.""" import unittest +from data import build_movielens + + class TestBuildMovielens(unittest.TestCase): - def test_flip_titles(self): - test_inputs = [ - "Green Mile, The (1999) @ Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966) @ Devil's Advocate, The (1997) ", - "King's Speech, The (2010) @ Social Network, The (2010) @ Catch Me If You Can (2002)", - "Brady Bunch Movie, The (1995) @ Shining, The (1980) @ Cool Hand Luke (1967)", - "House Bunny, The (2008)", - "Ten Commandments, The (1956)", - "Fake Movie, the (subtitle) weirdness, (0000)" - ] - - test_outputs = [ - "The Green Mile (1999) @ The Good, the Bad and the Ugly (Buono, il brutto, il cattivo, Il) (1966) @ The Devil's Advocate (1997)", - "The King's Speech (2010) @ The Social Network (2010) @ Catch Me If You Can (2002)", - "The Brady Bunch Movie (1995) @ The Shining (1980) @ Cool Hand Luke (1967)", - "The House Bunny (2008)", - "The Ten Commandments (1956)", - "the Fake Movie (subtitle) weirdness, (0000)" - ] - for test_input, test_output in zip(test_inputs, test_outputs): - print(build_movielens.flip_titles(test_input), test_output) - self.assertEqual(build_movielens.flip_titles(test_input), test_output, "should put title in order") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file + def test_flip_titles(self): + test_inputs = [ + "Green Mile, The (1999) @ Good, the Bad and the Ugly, The (Buono, il brut" + + "to, il cattivo, Il) (1966) @ Devil's Advocate, The (1997) ", + "King's Speech, The (2010) @ Social Network, The (2010) @ Catch Me If You" + + " Can (2002)", + "Brady Bunch Movie, The (1995) @ Shining, The (1980) @ Cool Hand Luke (19" + + "67)", "House Bunny, The (2008)", "Ten Commandments, The (1956)", + "Fake Movie, the (subtitle) weirdness, (0000)" + ] + + test_outputs = [ + "The Green Mile (1999) @ The Good, the Bad and the Ugly (Buono, il brutto" + + ", il cattivo, Il) (1966) @ The Devil's Advocate (1997)", + "The King's Speech (2010) @ The Social Network (2010) @ Catch Me If You " + + "Can (2002)", + "The Brady Bunch Movie (1995) @ The Shining (1980) @ Cool Hand Luke " + + "(1967)", + "The House Bunny (2008)", + "The Ten Commandments (1956)", + "the Fake Movie (subtitle) weirdness, (0000)" + ] + for test_input, test_output in zip(test_inputs, test_outputs): + self.assertEqual(build_movielens.flip_titles(test_input), test_output, + "should put title in order") + + def test_parse_sequence(self): + test_sequences = [ + "(1, [1, 2])", + "(2, [3, 4]", + "(3, []", + "(4, [1, 1])" + ] + expected_parsed = [ + [1, 2], + [3, 4], + [], + [1, 1] + ] + for test_seq, ex_parsed in zip(test_sequences, expected_parsed): + self.assertEqual(build_movielens.parse_user_seq(test_seq), + ex_parsed, "incorrect string -> list parsing") + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_build_probes.py b/test/test_build_probes.py index 6041166..6338e87 100644 --- a/test/test_build_probes.py +++ b/test/test_build_probes.py @@ -95,5 +95,31 @@ def calc_pmi2(co_ab, pop_a, pop_b, num_seq): self.assertTrue(np.allclose(np.exp(pmi2), np.exp(expected_pmi2)), "pmi2 calculation incorrect") + def test_get_related_movies(self): + sequences = [ + ["a", "b", "c"], + ["b", "c", "d"], + ["c"] + ] + movie_ids = build_probe_1_data.create_movie_ids(sequences) + co = build_probe_1_data.create_cooccurrence(sequences, movie_ids) + pmi2 = build_probe_1_data.create_pmi(co, movie_ids) + 
diff --git a/test/test_build_probes.py b/test/test_build_probes.py
index 6041166..6338e87 100644
--- a/test/test_build_probes.py
+++ b/test/test_build_probes.py
@@ -95,5 +95,31 @@ def calc_pmi2(co_ab, pop_a, pop_b, num_seq):
     self.assertTrue(np.allclose(np.exp(pmi2), np.exp(expected_pmi2)),
                     "pmi2 calculation incorrect")
 
+  def test_get_related_movies(self):
+    sequences = [
+        ["a", "b", "c"],
+        ["b", "c", "d"],
+        ["c"]
+    ]
+    movie_ids = build_probe_1_data.create_movie_ids(sequences)
+    co = build_probe_1_data.create_cooccurrence(sequences, movie_ids)
+    pmi2 = build_probe_1_data.create_pmi(co, movie_ids)
+
+    all_movies = movie_ids["all_movies"]
+    movie_ids["id_to_movie"] = dict(zip([str(x) for x in
+                                         range(len(all_movies))], all_movies))
+    filtered_set = set(movie_ids["all_movies"])
+
+    expected_related = {
+        "a": ["b", "c", "d"],
+        "b": ["c", "d", "a"],
+        "c": ["b", "d", "a"],
+        "d": ["b", "c", "a"]
+    }
+
+    for movie, ex_related in expected_related.items():
+      related = build_probe_1_data.get_related_movies(movie, movie_ids, pmi2,
+                                                      filtered_set, k=3)
+      self.assertEqual(", ".join(ex_related), ", ".join(related))
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/trainer/constants.py b/trainer/constants.py
index 6db055f..e87a640 100644
--- a/trainer/constants.py
+++ b/trainer/constants.py
@@ -53,7 +53,7 @@
 PROBE_1_TSV_PATH = {
     "validation": os.path.join(PROBE_DIR, "probe_1.tsv")
 }
-PROBE_1_SEQ_PATH = {
+PROBE_1_SEQ_TSV_PATH = {
     "validation": os.path.join(PROBE_DIR, "probe_1_sequences.tsv")
 }
 PROBE_2_TSV_PATH = {
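The rename to PROBE_1_SEQ_TSV_PATH keeps the constant in line with the file that build_probe_1_data.py writes in sequences mode, which derives its name by splitting the normal probe path on ".". A quick check of that relationship, with a made-up PROBE_DIR standing in for the real bucket path (the split would misbehave if the directory itself contained a dot):

    import os

    PROBE_DIR = "gs://example-bucket/probes"  # illustrative value only
    path, extension = os.path.join(PROBE_DIR, "probe_1.tsv").split(".")
    assert path + "_sequences" + "." + extension == os.path.join(
        PROBE_DIR, "probe_1_sequences.tsv")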
diff --git a/trainer/finetune.py b/trainer/finetune.py
index 4062ad1..e3281e7 100644
--- a/trainer/finetune.py
+++ b/trainer/finetune.py
@@ -50,6 +50,7 @@
                     "a step number or -1 for latest"))
 flags.DEFINE_enum("tags_version", "normal", ["normal", "reversed", "masked"],
                   "version of the tags dataset: normal, reversed, or masked")
+flags.DEFINE_integer("eval_start", 999900, "step at which to start eval")
 flags.DEFINE_integer("beam_size", 1, "beam size for saved model")
 flags.DEFINE_float("temperature", 1.0, "temperature for saved model")
 flags.DEFINE_float("learning_rate", .003, "learning rate for finetuning")
@@ -107,8 +108,6 @@ def main(_):
       # Supply a function which preprocesses text from the tf.data.Dataset.
       text_preprocessor=[
           preprocessing.preprocessor_wrapper("rd_recommendations")],
-      # Use the same vocabulary that we used for pre-training.
-      # sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
       # Lowercase targets before computing metrics.
       postprocess_fn=t5.data.postprocessors.lower_text,
       # We'll use bleu, bleu no titles, and recall as our evaluation metrics.
@@ -124,11 +123,9 @@ def main(_):
       splits=["train", "validation"],
       # Supply a function which preprocesses text from the tf.data.Dataset.
       text_preprocessor=[preprocessing.preprocessor_wrapper("ml_sequences")],
-      # Use the same vocabulary that we used for pre-training.
-      # sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
       # Lowercase targets before computing metrics.
       postprocess_fn=t5.data.postprocessors.lower_text,
-      # We'll use accuracy/recall as our evaluation metric.
+      # We'll use accuracy as our evaluation metric.
       metric_fns=[t5.evaluation.metrics.accuracy])
 
   # set up the ml-tags task (training on movielens tags and genres)
@@ -141,11 +138,9 @@ def main(_):
       splits=["train", "validation"],
       # Supply a function which preprocesses text from the tf.data.Dataset.
       text_preprocessor=[preprocessing.preprocessor_wrapper("ml_tags")],
-      # Use the same vocabulary that we used for pre-training.
-      # sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
       # Lowercase targets before computing metrics.
       postprocess_fn=t5.data.postprocessors.lower_text,
-      # We'll use accuracy/recall and bleu as our evaluation metrics.
+      # We'll use accuracy as our evaluation metric.
       metric_fns=[t5.evaluation.metrics.accuracy])
 
   # set up the ml-reviews task (training on movielens movies with imdb reviews)
@@ -157,31 +152,25 @@ def main(_):
       splits=["train", "validation"],
       # Supply a function which preprocesses text from the tf.data.Dataset.
       text_preprocessor=[preprocessing.preprocessor_wrapper("ml_reviews")],
-      # Use the same vocabulary that we used for pre-training.
-      # sentencepiece_model_path=t5.data.DEFAULT_SPM_PATH,
       # Lowercase targets before computing metrics.
       postprocess_fn=t5.data.postprocessors.lower_text,
-      # We'll use accuracy/recall and bleu as our evaluation metrics.
+      # We'll use bleu as our evaluation metric.
       metric_fns=[metrics.t2t_bleu])
 
   if "probe" in FLAGS.mode:
     if "sequences" in FLAGS.mode:
       t5.data.TaskRegistry.add(
           FLAGS.mode,
-          # Supply a function which returns a tf.data.Dataset.
           dataset_fn=preprocessing.dataset_fn_wrapper(FLAGS.mode),
           splits=["validation"],
-          # Supply a function which preprocesses text from the tf.data.Dataset.
           text_preprocessor=[
               preprocessing.preprocessor_wrapper("ml_sequences")],
           metric_fns=[metrics.probe_pair_accuracy])
     else:
       t5.data.TaskRegistry.add(
           FLAGS.mode,
-          # Supply a function which returns a tf.data.Dataset.
          dataset_fn=preprocessing.dataset_fn_wrapper(FLAGS.mode),
          splits=["validation"],
-          # Supply a function which preprocesses text from the tf.data.Dataset.
          text_preprocessor=[
              preprocessing.preprocessor_wrapper("rd_recommendations")],
          metric_fns=[metrics.probe_pair_accuracy])
@@ -239,14 +228,14 @@ def main(_):
     model.batch_size = train_batch_size * 8
     model.eval(
         mixture_or_task_name=FLAGS.task,
-        checkpoint_steps=list(range(999900, 999901+FLAGS.steps, 2000)),
+        checkpoint_steps=list(range(FLAGS.eval_start, 999901 + FLAGS.steps, 2000)),
         compute_sequence_length=False
     )
 
   if "probe" in FLAGS.mode:
     model.batch_size = train_batch_size * 8
-    for steps in range(999900, 999901+FLAGS.steps, 2000):
+    for steps in range(FLAGS.eval_start, 999901 + FLAGS.steps, 2000):
       model.eval(
           mixture_or_task_name=FLAGS.mode,
           checkpoint_steps=steps,
diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py
index 068dec5..8096dc8 100644
--- a/trainer/preprocessing.py
+++ b/trainer/preprocessing.py
@@ -112,7 +112,7 @@ def dataset_fn_wrapper(dataset):
       "ml_tags_reversed": constants.ML_TAGS_TSV_PATH,
       "ml_reviews": constants.ML_REVIEWS_TSV_PATH,
       "probe_1": constants.PROBE_1_TSV_PATH,
-      "probe_1_sequences": constants.PROBE_1_TSV_PATH,
+      "probe_1_sequences": constants.PROBE_1_SEQ_TSV_PATH,
       "probe_2": constants.PROBE_2_TSV_PATH,
       "probe_3": constants.PROBE_3_TSV_PATH,
       "probe_4": constants.PROBE_4_TSV_PATH
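A small sketch of how the new eval_start flag interacts with the existing steps flag when the checkpoint list is built; the flag values below are illustrative, not defaults from a real run:

    eval_start, steps = 1003900, 10000  # e.g. --eval_start=1003900 --steps=10000
    checkpoint_steps = list(range(eval_start, 999901 + steps, 2000))
    # -> [1003900, 1005900, 1007900, 1009900]; previously the sweep always
    #    started at the hard-coded 999900 checkpoint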