From d766d3e1cf4ec937ffe7beb8981873287fa21d58 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 17:08:51 +0300 Subject: [PATCH 01/14] Add papers.csv creator and UMAP projection in scripts/reduce.py --- scripts/README.md | 6 +- scripts/create_papers_csv.py | 122 +++++++++++++++++++++++++++++++++++ scripts/reduce.py | 16 ++++- scripts/requirements.txt | 1 + 4 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 scripts/create_papers_csv.py diff --git a/scripts/README.md b/scripts/README.md index 9caaf57c4..13a05b4e2 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,5 +1,9 @@ This directory contains extensions to help support the mini-conf library. +Follow the procedure described in [this gist](https://gist.github.com/georgepar/3d5cda48c50c6ee57f56aaea9b99603d) to obtain +the embeddings and the paper projections. + + These include: * `embeddings.py` : For turning abstracts into embeddings. Creates an `embeddings.torch` file. @@ -11,7 +15,7 @@ python embeddings.py ../sitedata/papers.csv * `reduce.py` : For creating two-dimensional representations of the embeddings. ```bash -python embeddings.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json +python reduce.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json --projection-method umap ``` * `parse_calendar.py` : to convert a local or remote ICS file to JSON. -- more on importing calendars see [README_Schedule.md](README_Schedule.md) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py new file mode 100644 index 000000000..c5ffc9941 --- /dev/null +++ b/scripts/create_papers_csv.py @@ -0,0 +1,122 @@ +import argparse +import numpy as np +import pickle +from sklearn.feature_extraction.text import TfidfVectorizer +import openreview + +OLD_HEADERS = [ + 'Submission ID', 'Title', 'Authors', 'Abstract', 'Submission Type' +] + +NEW_HEADER = ['UID', 'title', 'authors', 'abstract', 'keywords', 'session'] + + +class CsvConverter(object): + def __init__(self, n_keywords=5): + self.n_keywords = n_keywords + + def get_uid(self, entry): + return entry['Submission ID'] + + def get_title(self, entry): + return entry['Title'] + + def get_authors(self, entry): + return entry['Authors'] + + def get_abstract(self, entry): + return entry['Abstract'] + + def get_keywords(self, entry, tfidf_model): + scores = tfidf_model.transform([entry['Abstract']])[0] + words = np.array(tfidf_model.get_feature_names()) + sorted_scores = np.argsort(scores.data) + top_scores = sorted_scores[:-(self.n_keywords + 1): -1] + keywords = words[scores.indices[top_scores]].tolist() + return '|'.join(keywords) + + def keyword_model(self, abstracts): + # Replace this if we get a list of keywords + # For now return top TF-IDF terms of words in abstracts + tfidf = TfidfVectorizer(stop_words='english').fit(abstracts) + return tfidf + + def get_session(self, entry): + # FIXME: Use this as a placeholder until we get some session info + return entry['Submission Type'] + + def parse_accepted_papers(self, tsv_file): + with open(tsv_file, 'r') as fd: + lines = [l.strip().split('\t') for l in fd] + header, paper_info = lines[0], lines[1:] + papers = [] + for paper in paper_info: + entry = {} + for i, h in enumerate(header): + entry[h] = paper[i] + papers.append(entry) + return papers + + def convert_entries(self, entries): + tfidf = self.keyword_model([e['Abstract'] for e in entries]) + + def get_new_entry(e): + return ( + e['Submission ID'], + e['Title'], + 
'|'.join(e['Authors'].split(',')), + '"{}"'.format(e['Abstract']), + self.get_keywords(e, tfidf), + # FIXME: Use this as a placeholder until session info + # is available + e['Submission Type'] + ) + new_entries = [get_new_entry(e) for e in entries] + return new_entries + + def convert(self, old_tsv, papers_csv, out_pickle=None): + old_entries = self.parse_accepted_papers(old_tsv) + new_entries = self.convert_entries(old_entries) + with open(papers_csv, 'w') as fd: + header = ','.join(NEW_HEADER) + fd.write('{}\n'.format(header)) + for entry in new_entries: + e = ','.join(entry) + fd.write('{}\n'.format(e)) + if out_pickle is not None: + cached_or = {} + for entry in new_entries: + cached_or[ + entry[0] # id + ] = openreview.Note( + '', [], [], [], + {'abstract': entry[3], 'title': entry[1]} + ) # Hack. ICLR Recommender script accepts Openreview notes + + with open(out_pickle, 'wb') as fd: + pickle.dump(cached_or, fd) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert CSV from original ACL format to Miniconf compatible format" + ) + parser.add_argument('--inp', type=str, help='Original ACL CSV') + parser.add_argument('--out', type=str, help='papers.csv') + parser.add_argument( + '--out-pickle', type=str, + help='Dump entries into a pickle compatible with ICLR Recommendation engine' + ) + parser.add_argument( + '--n-keywords', type=int, default=3, help='Number of keywords to keep') + return parser.parse_args() + + +def main(): + args = parse_args() + csv_converter = CsvConverter(n_keywords=args.n_keywords) + csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) + + +if __name__ == '__main__': + main() diff --git a/scripts/reduce.py b/scripts/reduce.py index 9d69fb9ab..4c18f6b55 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -2,6 +2,8 @@ import csv import json +import umap + import sklearn.manifold import torch @@ -18,7 +20,19 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() emb = torch.load(args.embeddings) - out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + if args.projection_method == 'tsne': + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + elif args.projection_method == 'umap': + out = umap.UMAP( + n_neighbors=5, + min_dist=0.3, + metric='correlation', + n_components=2 + ).fit_transform(emb.numpy()) + else: + print('invalid projection-method: {}'.format(args.projection_method)) + print('Falling back to T-SNE') + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) d = [] with open(args.papers, "r") as f: abstracts = list(csv.DictReader(f)) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 01bfdcd3c..cc0eb25bf 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,5 @@ transformers sklearn +umap-learn torch==1.4.0 ics From d97a8769c82e9f0bd5d28733056f65a611d00abe Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 17:24:23 +0300 Subject: [PATCH 02/14] Reformat code --- scripts/create_papers_csv.py | 83 ++++++++++++++++++------------------ scripts/reduce.py | 16 +++---- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index c5ffc9941..9f1d466ce 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -1,14 +1,14 @@ import argparse -import numpy as np import pickle -from sklearn.feature_extraction.text import TfidfVectorizer + +import numpy as np import openreview +from 
sklearn.feature_extraction.text import TfidfVectorizer + +OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] -OLD_HEADERS = [ - 'Submission ID', 'Title', 'Authors', 'Abstract', 'Submission Type' -] -NEW_HEADER = ['UID', 'title', 'authors', 'abstract', 'keywords', 'session'] +NEW_HEADER = ["UID", "title", "authors", "abstract", "keywords", "session"] class CsvConverter(object): @@ -16,38 +16,38 @@ def __init__(self, n_keywords=5): self.n_keywords = n_keywords def get_uid(self, entry): - return entry['Submission ID'] + return entry["Submission ID"] def get_title(self, entry): - return entry['Title'] + return entry["Title"] def get_authors(self, entry): - return entry['Authors'] + return entry["Authors"] def get_abstract(self, entry): - return entry['Abstract'] + return entry["Abstract"] def get_keywords(self, entry, tfidf_model): - scores = tfidf_model.transform([entry['Abstract']])[0] + scores = tfidf_model.transform([entry["Abstract"]])[0] words = np.array(tfidf_model.get_feature_names()) sorted_scores = np.argsort(scores.data) - top_scores = sorted_scores[:-(self.n_keywords + 1): -1] + top_scores = sorted_scores[: -(self.n_keywords + 1) : -1] keywords = words[scores.indices[top_scores]].tolist() - return '|'.join(keywords) + return "|".join(keywords) def keyword_model(self, abstracts): # Replace this if we get a list of keywords # For now return top TF-IDF terms of words in abstracts - tfidf = TfidfVectorizer(stop_words='english').fit(abstracts) + tfidf = TfidfVectorizer(stop_words="english").fit(abstracts) return tfidf def get_session(self, entry): # FIXME: Use this as a placeholder until we get some session info - return entry['Submission Type'] + return entry["Submission Type"] def parse_accepted_papers(self, tsv_file): - with open(tsv_file, 'r') as fd: - lines = [l.strip().split('\t') for l in fd] + with open(tsv_file, "r") as fd: + lines = [l.strip().split("\t") for l in fd] header, paper_info = lines[0], lines[1:] papers = [] for paper in paper_info: @@ -58,57 +58,58 @@ def parse_accepted_papers(self, tsv_file): return papers def convert_entries(self, entries): - tfidf = self.keyword_model([e['Abstract'] for e in entries]) + tfidf = self.keyword_model([e["Abstract"] for e in entries]) def get_new_entry(e): return ( - e['Submission ID'], - e['Title'], - '|'.join(e['Authors'].split(',')), - '"{}"'.format(e['Abstract']), + e["Submission ID"], + e["Title"], + "|".join(e["Authors"].split(",")), + '"{}"'.format(e["Abstract"]), self.get_keywords(e, tfidf), # FIXME: Use this as a placeholder until session info # is available - e['Submission Type'] + e["Submission Type"], ) + new_entries = [get_new_entry(e) for e in entries] return new_entries def convert(self, old_tsv, papers_csv, out_pickle=None): old_entries = self.parse_accepted_papers(old_tsv) new_entries = self.convert_entries(old_entries) - with open(papers_csv, 'w') as fd: - header = ','.join(NEW_HEADER) - fd.write('{}\n'.format(header)) + with open(papers_csv, "w") as fd: + header = ",".join(NEW_HEADER) + fd.write("{}\n".format(header)) for entry in new_entries: - e = ','.join(entry) - fd.write('{}\n'.format(e)) + e = ",".join(entry) + fd.write("{}\n".format(e)) if out_pickle is not None: cached_or = {} for entry in new_entries: - cached_or[ - entry[0] # id - ] = openreview.Note( - '', [], [], [], - {'abstract': entry[3], 'title': entry[1]} + cached_or[entry[0]] = openreview.Note( # id + "", [], [], [], {"abstract": entry[3], "title": entry[1]} ) # Hack. 
ICLR Recommender script accepts Openreview notes - with open(out_pickle, 'wb') as fd: + with open(out_pickle, "wb") as fd: pickle.dump(cached_or, fd) def parse_args(): parser = argparse.ArgumentParser( - description="Convert CSV from original ACL format to Miniconf compatible format" + description="Convert CSV from original ACL format to Miniconf " + "compatible format" ) - parser.add_argument('--inp', type=str, help='Original ACL CSV') - parser.add_argument('--out', type=str, help='papers.csv') + parser.add_argument("--inp", type=str, help="Original ACL CSV") + parser.add_argument("--out", type=str, help="papers.csv") parser.add_argument( - '--out-pickle', type=str, - help='Dump entries into a pickle compatible with ICLR Recommendation engine' + "--out-pickle", + type=str, + help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", ) parser.add_argument( - '--n-keywords', type=int, default=3, help='Number of keywords to keep') + "--n-keywords", type=int, default=3, help="Number of keywords to keep" + ) return parser.parse_args() @@ -118,5 +119,5 @@ def main(): csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/reduce.py b/scripts/reduce.py index 4c18f6b55..e13f03153 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -2,10 +2,9 @@ import csv import json -import umap - import sklearn.manifold import torch +import umap def parse_arguments(): @@ -20,18 +19,15 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() emb = torch.load(args.embeddings) - if args.projection_method == 'tsne': + if args.projection_method == "tsne": out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) - elif args.projection_method == 'umap': + elif args.projection_method == "umap": out = umap.UMAP( - n_neighbors=5, - min_dist=0.3, - metric='correlation', - n_components=2 + n_neighbors=5, min_dist=0.3, metric="correlation", n_components=2 ).fit_transform(emb.numpy()) else: - print('invalid projection-method: {}'.format(args.projection_method)) - print('Falling back to T-SNE') + print("invalid projection-method: {}".format(args.projection_method)) + print("Falling back to T-SNE") out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) d = [] with open(args.papers, "r") as f: From 2fd143560c524b9a96889743b3e4adb25716a8eb Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 18:30:47 +0300 Subject: [PATCH 03/14] Add similar papers in poster page --- main.py | 5 +++++ templates/poster.html | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/main.py b/main.py index 037842e1f..a56281504 100644 --- a/main.py +++ b/main.py @@ -187,6 +187,11 @@ def poster(poster): uid = poster v = by_uid["papers"][uid] data = _data() + + data["openreview"] = format_paper(by_uid["papers"][uid]) + data["id"] = uid + data["paper_recs"] = [format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid]][1:] + data["paper"] = format_paper(v) return render_template("poster.html", **data) diff --git a/templates/poster.html b/templates/poster.html index c5d91c6e2..476767e62 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -123,6 +123,33 @@
     })

[hunk garbled in extraction: its 27 added lines insert a "Similar Papers" section into templates/poster.html; the surviving fragments show a "Similar Papers" heading, a {% for openreview in paper_recs %} loop that renders each recommendation's {{openreview.content.title}} and its authors via {% for a in openreview.content.authors %} {{a}}, {% endfor %}, and a closing {% endfor %}; the surrounding HTML markup did not survive]
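For reference, the `site_data["paper_recs"]` mapping that `main.py` reads above is just a dict from each paper UID to a ranked list of similar-paper UIDs, with the paper itself typically listed first (hence the `[1:]`). The guide added in the next patch builds it with the pretrained ICLR recommender; the sketch below is only a hypothetical stand-in for local testing that produces an equivalent mapping with cosine nearest neighbours over the BERT abstract embeddings. The file paths and the assumption that rows of `embeddings.torch` align with rows of `papers.csv` are not guaranteed by these patches.

```python
# Hypothetical stand-in, not part of this PR: build a paper_recs-style mapping with
# cosine nearest neighbours over the abstract embeddings from scripts/embeddings.py,
# instead of the pretrained ICLR recommender. Assumes embedding rows line up with papers.csv.
import csv
import json

import torch
from sklearn.neighbors import NearestNeighbors

with open("sitedata_acl2020/papers.csv") as f:
    uids = [row["UID"] for row in csv.DictReader(f)]

emb = torch.load("embeddings.torch").numpy()

# n_neighbors=6 returns each paper itself plus its five closest abstracts.
nn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(emb)
_, idx = nn.kneighbors(emb)

# Each list starts with the paper itself, which main.py strips with [1:].
paper_recs = {uids[i]: [uids[j] for j in row] for i, row in enumerate(idx)}

with open("paper_recs.json", "w") as f:
    json.dump(paper_recs, f, indent=2)
```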
From 31c19f76694a88c82db4061eb1b0e5ce34c72e25 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 19:02:25 +0300 Subject: [PATCH 04/14] Add guide to produce similar paper recommendations --- scripts/README.recommendations.md | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 scripts/README.recommendations.md diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md new file mode 100644 index 000000000..df0cd554c --- /dev/null +++ b/scripts/README.recommendations.md @@ -0,0 +1,41 @@ +# How to get similar paper recommendations + +In this guide we can see how to get paper recommendations using the pretrained model provided +from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) and abstract embeddings. + + + +## Create a visualization based on BERT embeddings + +1. Grab ACL2020 + [papers.csv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/papers.csv) + from this branch or a more recent version and copy it to `sitedata_acl2020`. +2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings + for the paper abstracts. +3. Run `python reduce.py ../acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > ../sitedata_acl2020/papers_projection.json --projection-method [tsne|umap]` + to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` + selects which dimensionality reduction technique to use. +4. Rerun `make run` and go to the paper visualization page + + +## Produce similar paper recommendations + +1. Grab the + [acl2020\_accepted\_papers.tsv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/acl2020_accepted_papers.tsv) + file. +2. Run `python scripts/create_papers_csv.py --inp acl2020_accepted_papers.tsv --out dummy.csv --out-pickle cached_or.pkl --n-keywords 5` to produce `cached_or.pkl`. + This file is compatible with the inference scripts provided in [https://github.com/ICLR/iclr.github.io/tree/master/recommendations](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) +3. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will + need `git-lfs` installed. +4. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` +5. Install missing requirements +6. `python recs.py`. This will run inference using a pretrained similarity model and produce the + `rec.pkl` file that contains the paper similarities. +7. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` + file that contains the similar paper recommendations that can be displayed to the website. Make + sure to modify the filepaths to point to the correct `cached_or.pkl`, `rec.pkl`. +8. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file + produced using this method is + [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json) +9. I have already modified the `poster.html` template and `main.py` to display the paper + recommendations in `54_add_similar_papers_graph` branch. 
From e556362b9b492f0a0a387d70361a31c8a82e4ecc Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 19:51:05 +0300 Subject: [PATCH 05/14] Reformat main.py --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index a56281504..9b0c98e80 100644 --- a/main.py +++ b/main.py @@ -190,7 +190,9 @@ def poster(poster): data["openreview"] = format_paper(by_uid["papers"][uid]) data["id"] = uid - data["paper_recs"] = [format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid]][1:] + data["paper_recs"] = [ + format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid] + ][1:] data["paper"] = format_paper(v) return render_template("poster.html", **data) From 5af073f7329fb35f99090de27ef4e3836ceeb8ad Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 20:32:49 +0300 Subject: [PATCH 06/14] make image_path configurable --- scripts/create_papers_csv.py | 2 ++ templates/poster.html | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index 9f1d466ce..d4a22b194 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -3,8 +3,10 @@ import numpy as np import openreview + from sklearn.feature_extraction.text import TfidfVectorizer + OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] diff --git a/templates/poster.html b/templates/poster.html index 476767e62..a6d796506 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -143,7 +143,7 @@
                     {{a}},
                     {% endfor %}
[hunk garbled in extraction: per the commit message, the single removed/added pair swaps the hard-coded recommendation image source for the configurable image_path; the HTML markup around the changed line did not survive]
From 857fc8c5911d1c06ccfabeb6245f2fc70dd96950 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 13:51:15 +0300 Subject: [PATCH 07/14] format create_papers_csv --- scripts/create_papers_csv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index d4a22b194..b290ba680 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -2,10 +2,9 @@ import pickle import numpy as np -import openreview - from sklearn.feature_extraction.text import TfidfVectorizer +import openreview OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] From 07b2f8ba260585a628e4dcef559f961f5bae0390 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 14:43:24 +0300 Subject: [PATCH 08/14] Update with latest papers.csv and simplify code --- scripts/README.recommendations.md | 2 +- scripts/create_papers_csv.py | 124 ----------------------- scripts/create_recommendations_pickle.py | 50 +++++++++ scripts/reduce.py | 1 + scripts/requirements.txt | 1 + 5 files changed, 53 insertions(+), 125 deletions(-) delete mode 100644 scripts/create_papers_csv.py create mode 100644 scripts/create_recommendations_pickle.py diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md index df0cd554c..1bc2bdd26 100644 --- a/scripts/README.recommendations.md +++ b/scripts/README.recommendations.md @@ -12,7 +12,7 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend from this branch or a more recent version and copy it to `sitedata_acl2020`. 2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings for the paper abstracts. -3. Run `python reduce.py ../acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > ../sitedata_acl2020/papers_projection.json --projection-method [tsne|umap]` +3. Run `python scripts/reduce.py --projection-method [tsne|umap] acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` selects which dimensionality reduction technique to use. 4. 
Rerun `make run` and go to the paper visualization page diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py deleted file mode 100644 index b290ba680..000000000 --- a/scripts/create_papers_csv.py +++ /dev/null @@ -1,124 +0,0 @@ -import argparse -import pickle - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer - -import openreview - -OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] - - -NEW_HEADER = ["UID", "title", "authors", "abstract", "keywords", "session"] - - -class CsvConverter(object): - def __init__(self, n_keywords=5): - self.n_keywords = n_keywords - - def get_uid(self, entry): - return entry["Submission ID"] - - def get_title(self, entry): - return entry["Title"] - - def get_authors(self, entry): - return entry["Authors"] - - def get_abstract(self, entry): - return entry["Abstract"] - - def get_keywords(self, entry, tfidf_model): - scores = tfidf_model.transform([entry["Abstract"]])[0] - words = np.array(tfidf_model.get_feature_names()) - sorted_scores = np.argsort(scores.data) - top_scores = sorted_scores[: -(self.n_keywords + 1) : -1] - keywords = words[scores.indices[top_scores]].tolist() - return "|".join(keywords) - - def keyword_model(self, abstracts): - # Replace this if we get a list of keywords - # For now return top TF-IDF terms of words in abstracts - tfidf = TfidfVectorizer(stop_words="english").fit(abstracts) - return tfidf - - def get_session(self, entry): - # FIXME: Use this as a placeholder until we get some session info - return entry["Submission Type"] - - def parse_accepted_papers(self, tsv_file): - with open(tsv_file, "r") as fd: - lines = [l.strip().split("\t") for l in fd] - header, paper_info = lines[0], lines[1:] - papers = [] - for paper in paper_info: - entry = {} - for i, h in enumerate(header): - entry[h] = paper[i] - papers.append(entry) - return papers - - def convert_entries(self, entries): - tfidf = self.keyword_model([e["Abstract"] for e in entries]) - - def get_new_entry(e): - return ( - e["Submission ID"], - e["Title"], - "|".join(e["Authors"].split(",")), - '"{}"'.format(e["Abstract"]), - self.get_keywords(e, tfidf), - # FIXME: Use this as a placeholder until session info - # is available - e["Submission Type"], - ) - - new_entries = [get_new_entry(e) for e in entries] - return new_entries - - def convert(self, old_tsv, papers_csv, out_pickle=None): - old_entries = self.parse_accepted_papers(old_tsv) - new_entries = self.convert_entries(old_entries) - with open(papers_csv, "w") as fd: - header = ",".join(NEW_HEADER) - fd.write("{}\n".format(header)) - for entry in new_entries: - e = ",".join(entry) - fd.write("{}\n".format(e)) - if out_pickle is not None: - cached_or = {} - for entry in new_entries: - cached_or[entry[0]] = openreview.Note( # id - "", [], [], [], {"abstract": entry[3], "title": entry[1]} - ) # Hack. 
ICLR Recommender script accepts Openreview notes - - with open(out_pickle, "wb") as fd: - pickle.dump(cached_or, fd) - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Convert CSV from original ACL format to Miniconf " - "compatible format" - ) - parser.add_argument("--inp", type=str, help="Original ACL CSV") - parser.add_argument("--out", type=str, help="papers.csv") - parser.add_argument( - "--out-pickle", - type=str, - help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", - ) - parser.add_argument( - "--n-keywords", type=int, default=3, help="Number of keywords to keep" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - csv_converter = CsvConverter(n_keywords=args.n_keywords) - csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) - - -if __name__ == "__main__": - main() diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py new file mode 100644 index 000000000..5e041455f --- /dev/null +++ b/scripts/create_recommendations_pickle.py @@ -0,0 +1,50 @@ +import argparse +import csv +import pickle + +import numpy as np +import openreview +from sklearn.feature_extraction.text import TfidfVectorizer + + +def read_entries(papers_csv): + with open(papers_csv, "r") as fd: + entries = list(csv.reader(fd, skipinitialspace=True)) + entries = entries[1:] # skip header + + return entries + + +def dump_cached_or(entries, out_pickle): + cached_or = {} + for entry in entries: + cached_or[entry[0]] = openreview.Note( # id + "", [], [], [], {"abstract": entry[3], "title": entry[1]} + ) # Hack. ICLR Recommender script accepts Openreview notes + + with open(out_pickle, "wb") as fd: + pickle.dump(cached_or, fd) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert CSV from original ACL format to Miniconf " + "compatible format" + ) + parser.add_argument("--inp", type=str, help="papers.csv") + parser.add_argument( + "--out", + type=str, + help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + entries = read_entries(args.inp) + dump_cached_or(entries, args.out) + + +if __name__ == "__main__": + main() diff --git a/scripts/reduce.py b/scripts/reduce.py index e13f03153..0f24c5083 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -12,6 +12,7 @@ def parse_arguments(): parser.add_argument("papers", default=False, help="paper file") parser.add_argument("embeddings", default=False, help="embeddings file to shrink") + parser.add_argument("--projection-method", default="tsne", help="[umap|tsne]") return parser.parse_args() diff --git a/scripts/requirements.txt b/scripts/requirements.txt index cc0eb25bf..0da07c264 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,5 +1,6 @@ transformers sklearn umap-learn +openreview-py torch==1.4.0 ics From b21340643c7363a52393bb159212b26dc1026304 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 14:47:38 +0300 Subject: [PATCH 09/14] Remove unused imports --- scripts/create_recommendations_pickle.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py index 5e041455f..1e7ec7f26 100644 --- a/scripts/create_recommendations_pickle.py +++ b/scripts/create_recommendations_pickle.py @@ -2,9 +2,7 @@ import csv import pickle -import numpy as np import openreview -from sklearn.feature_extraction.text 
import TfidfVectorizer def read_entries(papers_csv): From 191363573dbc8f6b0828cb643496030db7230275 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:01:58 +0300 Subject: [PATCH 10/14] Ignore typecheck for openreview and umap-learn --- scripts/create_recommendations_pickle.py | 4 +++- scripts/reduce.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py index 1e7ec7f26..dea19581a 100644 --- a/scripts/create_recommendations_pickle.py +++ b/scripts/create_recommendations_pickle.py @@ -2,7 +2,9 @@ import csv import pickle -import openreview +import openreview # type: ignore + +# No type hints for openreview-py package. Ignore mypy def read_entries(papers_csv): diff --git a/scripts/reduce.py b/scripts/reduce.py index 0f24c5083..5fd1d4805 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -4,7 +4,9 @@ import sklearn.manifold import torch -import umap +import umap # type: ignore + +# No type stubs for umap-learn. Ignore mypy def parse_arguments(): From 2a52660e974b2c08e4d3654bfb7f8ce05e1c6e32 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:40:32 +0300 Subject: [PATCH 11/14] Modify poster.html to get correct id field --- templates/poster.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/poster.html b/templates/poster.html index 9d2e09f5e..a03ff2a36 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -151,7 +151,7 @@
                 Similar Papers
[hunk garbled in extraction: per the commit message, its two removed/added pairs point the similar-paper links at the correct id field; only the Jinja fragments {{openreview.content.title}}, {{a}}, {% endfor %} and the bare +/- markers survive]
From 7571602d0a535d1dbe5f81340206050653397ab2 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:44:39 +0300 Subject: [PATCH 12/14] refactor templates/poster.html --- templates/poster.html | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/templates/poster.html b/templates/poster.html index a03ff2a36..e199c2ca8 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -147,19 +147,19 @@
                 Similar Papers
[hunk garbled in extraction: its five removed/added pairs rename the Similar Papers loop variable from openreview to recommended, i.e. {% for openreview in paper_recs %} becomes {% for recommended in paper_recs %}, {{openreview.content.title}} becomes {{recommended.content.title}}, and {% for a in openreview.content.authors %} becomes {% for a in recommended.content.authors %}; the surrounding HTML markup did not survive]
From 9776a15dc72852257912a388303d7e0ba67c3330 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:50:03 +0300 Subject: [PATCH 13/14] Update README.md --- scripts/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index fcef59000..34b4c752a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,7 +1,6 @@ This directory contains extensions to help support the mini-conf library. -Follow the procedure described in [this gist](https://gist.github.com/georgepar/3d5cda48c50c6ee57f56aaea9b99603d) to obtain -the embeddings and the paper projections. +For the updated procedure on getting similar papers + recommendations refer to README.recommendations.md These include: From b5a94a89886aecdbd65b7afb78bdeff7fb1a35b5 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:53:53 +0300 Subject: [PATCH 14/14] Update README.recommendations.md --- scripts/README.recommendations.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md index 1bc2bdd26..6d76fc8ac 100644 --- a/scripts/README.recommendations.md +++ b/scripts/README.recommendations.md @@ -12,7 +12,7 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend from this branch or a more recent version and copy it to `sitedata_acl2020`. 2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings for the paper abstracts. -3. Run `python scripts/reduce.py --projection-method [tsne|umap] acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` +3. Run `python scripts/reduce.py --projection-method [tsne|umap] sitedata_acl2020/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` selects which dimensionality reduction technique to use. 4. Rerun `make run` and go to the paper visualization page @@ -20,22 +20,16 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend ## Produce similar paper recommendations -1. Grab the - [acl2020\_accepted\_papers.tsv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/acl2020_accepted_papers.tsv) - file. -2. Run `python scripts/create_papers_csv.py --inp acl2020_accepted_papers.tsv --out dummy.csv --out-pickle cached_or.pkl --n-keywords 5` to produce `cached_or.pkl`. +1. Run `python scripts/create_recommendations_pickle.py --inp sitedata_acl2020/papers.csv --out cached_or.pkl` to produce `cached_or.pkl`. This file is compatible with the inference scripts provided in [https://github.com/ICLR/iclr.github.io/tree/master/recommendations](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) -3. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will +2. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will need `git-lfs` installed. -4. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` -5. Install missing requirements -6. `python recs.py`. This will run inference using a pretrained similarity model and produce the +3. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` +4. Install missing requirements +5. `python recs.py`. 
This will run inference using a pretrained similarity model and produce the `rec.pkl` file that contains the paper similarities. -7. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` +6. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` file that contains the similar paper recommendations that can be displayed to the website. Make sure to modify the filepaths to point to the correct `cached_or.pkl`, `rec.pkl`. -8. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file - produced using this method is - [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json) -9. I have already modified the `poster.html` template and `main.py` to display the paper - recommendations in `54_add_similar_papers_graph` branch. +7. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file + produced using this method is [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json)
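Before rerunning `make run`, it is worth a quick check that the copied `paper_recs.json` has the shape `main.py` reads: a dict mapping each paper UID to a list of similar-paper UIDs, with the paper itself typically first. A minimal sketch, assuming that structure and hypothetical paths:

```python
# Hypothetical spot check, not part of this PR: print the stored recommendations
# for one paper, resolving UIDs to titles via papers.csv.
import csv
import json

with open("sitedata_acl2020/papers.csv") as f:
    titles = {row["UID"]: row["title"] for row in csv.DictReader(f)}

with open("sitedata_acl2020/paper_recs.json") as f:
    paper_recs = json.load(f)

uid = next(iter(paper_recs))
print("Recommendations for:", titles.get(uid, uid))
for rec_uid in paper_recs[uid][1:]:  # skip the paper itself, as main.py does
    print("  -", titles.get(rec_uid, rec_uid))
```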