diff --git a/main.py b/main.py index 5c2cc1a11..b656b83e0 100644 --- a/main.py +++ b/main.py @@ -198,6 +198,13 @@ def poster(poster): uid = poster v = by_uid["papers"][uid] data = _data() + + data["openreview"] = format_paper(by_uid["papers"][uid]) + data["id"] = uid + data["paper_recs"] = [ + format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid] + ][1:] + data["paper"] = format_paper(v) return render_template("poster.html", **data) diff --git a/scripts/README.md b/scripts/README.md index 3aec96e8e..34b4c752a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,5 +1,8 @@ This directory contains extensions to help support the mini-conf library. +For the updated procedure on getting similar papers + recommendations refer to README.recommendations.md + + These include: * `embeddings.py` : For turning abstracts into embeddings. Creates an `embeddings.torch` file. @@ -17,7 +20,7 @@ python3 scripts/generate_version.py build/version.json * `reduce.py` : For creating two-dimensional representations of the embeddings. ```bash -python embeddings.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json +python reduce.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json --projection-method umap ``` * `parse_calendar.py` : to convert a local or remote ICS file to JSON. -- more on importing calendars see [README_Schedule.md](README_Schedule.md) diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md new file mode 100644 index 000000000..6d76fc8ac --- /dev/null +++ b/scripts/README.recommendations.md @@ -0,0 +1,35 @@ +# How to get similar paper recommendations + +In this guide we can see how to get paper recommendations using the pretrained model provided +from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) and abstract embeddings. + + + +## Create a visualization based on BERT embeddings + +1. 
Grab ACL2020 + [papers.csv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/papers.csv) + from this branch or a more recent version and copy it to `sitedata_acl2020`. +2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings + for the paper abstracts. +3. Run `python scripts/reduce.py --projection-method [tsne|umap] sitedata_acl2020/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` + to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` + selects which dimensionality reduction technique to use. +4. Rerun `make run` and go to the paper visualization page + + +## Produce similar paper recommendations + +1. Run `python scripts/create_recommendations_pickle.py --inp sitedata_acl2020/papers.csv --out cached_or.pkl` to produce `cached_or.pkl`. + This file is compatible with the inference scripts provided in [https://github.com/ICLR/iclr.github.io/tree/master/recommendations](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) +2. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will + need `git-lfs` installed. +3. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` +4. Install missing requirements +5. `python recs.py`. This will run inference using a pretrained similarity model and produce the + `rec.pkl` file that contains the paper similarities. +6. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` + file that contains the similar paper recommendations that can be displayed on the website. Make + sure to modify the filepaths to point to the correct `cached_or.pkl`, `rec.pkl`. +7. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. 
A version of this file + produced using this method is [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json) diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py new file mode 100644 index 000000000..dea19581a --- /dev/null +++ b/scripts/create_recommendations_pickle.py @@ -0,0 +1,50 @@ +import argparse +import csv +import pickle + +import openreview # type: ignore + +# No type hints for openreview-py package. Ignore mypy + + +def read_entries(papers_csv): + with open(papers_csv, "r") as fd: + entries = list(csv.reader(fd, skipinitialspace=True)) + entries = entries[1:] # skip header + + return entries + + +def dump_cached_or(entries, out_pickle): + cached_or = {} + for entry in entries: + cached_or[entry[0]] = openreview.Note( # id + "", [], [], [], {"abstract": entry[3], "title": entry[1]} + ) # Hack. ICLR Recommender script accepts Openreview notes + + with open(out_pickle, "wb") as fd: + pickle.dump(cached_or, fd) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert CSV from original ACL format to Miniconf " + "compatible format" + ) + parser.add_argument("--inp", type=str, help="papers.csv") + parser.add_argument( + "--out", + type=str, + help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + entries = read_entries(args.inp) + dump_cached_or(entries, args.out) + + +if __name__ == "__main__": + main() diff --git a/scripts/reduce.py b/scripts/reduce.py index 9d69fb9ab..5fd1d4805 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -4,6 +4,9 @@ import sklearn.manifold import torch +import umap # type: ignore + +# No type stubs for umap-learn. 
Ignore mypy def parse_arguments(): @@ -11,6 +14,7 @@ def parse_arguments(): parser.add_argument("papers", default=False, help="paper file") parser.add_argument("embeddings", default=False, help="embeddings file to shrink") + parser.add_argument("--projection-method", default="tsne", help="[umap|tsne]") return parser.parse_args() @@ -18,7 +22,16 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() emb = torch.load(args.embeddings) - out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + if args.projection_method == "tsne": + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + elif args.projection_method == "umap": + out = umap.UMAP( + n_neighbors=5, min_dist=0.3, metric="correlation", n_components=2 + ).fit_transform(emb.numpy()) + else: + print("invalid projection-method: {}".format(args.projection_method)) + print("Falling back to T-SNE") + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) d = [] with open(args.papers, "r") as f: abstracts = list(csv.DictReader(f)) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 01bfdcd3c..0da07c264 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,6 @@ transformers sklearn +umap-learn +openreview-py torch==1.4.0 ics diff --git a/templates/poster.html b/templates/poster.html index b673d38cb..e199c2ca8 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -139,6 +139,33 @@
}) +
+
+

Similar Papers

+
+
+

+
+
+ {% for recommended in paper_recs %} +
+
+
+ +
{{recommended.content.title}}
+
+
+ {% for a in recommended.content.authors %} + {{a}}, + {% endfor %} +
+
+
+
+
+ {% endfor %} +
+