From d766d3e1cf4ec937ffe7beb8981873287fa21d58 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 17:08:51 +0300 Subject: [PATCH 01/14] Add papers.csv creator and UMAP projection in scripts/reduce.py --- scripts/README.md | 6 +- scripts/create_papers_csv.py | 122 +++++++++++++++++++++++++++++++++++ scripts/reduce.py | 16 ++++- scripts/requirements.txt | 1 + 4 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 scripts/create_papers_csv.py diff --git a/scripts/README.md b/scripts/README.md index 9caaf57c4..13a05b4e2 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,5 +1,9 @@ This directory contains extensions to help support the mini-conf library. +Follow the procedure described in [this gist](https://gist.github.com/georgepar/3d5cda48c50c6ee57f56aaea9b99603d) to obtain +the embeddings and the paper projections. + + These include: * `embeddings.py` : For turning abstracts into embeddings. Creates an `embeddings.torch` file. @@ -11,7 +15,7 @@ python embeddings.py ../sitedata/papers.csv * `reduce.py` : For creating two-dimensional representations of the embeddings. ```bash -python embeddings.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json +python reduce.py ../sitedata/papers.csv embeddings.torch > ../sitedata/papers_projection.json --projection-method umap ``` * `parse_calendar.py` : to convert a local or remote ICS file to JSON. -- more on importing calendars see [README_Schedule.md](README_Schedule.md) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py new file mode 100644 index 000000000..c5ffc9941 --- /dev/null +++ b/scripts/create_papers_csv.py @@ -0,0 +1,122 @@ +import argparse +import numpy as np +import pickle +from sklearn.feature_extraction.text import TfidfVectorizer +import openreview + +OLD_HEADERS = [ + 'Submission ID', 'Title', 'Authors', 'Abstract', 'Submission Type' +] + +NEW_HEADER = ['UID', 'title', 'authors', 'abstract', 'keywords', 'session'] + + +class CsvConverter(object): + def __init__(self, n_keywords=5): + self.n_keywords = n_keywords + + def get_uid(self, entry): + return entry['Submission ID'] + + def get_title(self, entry): + return entry['Title'] + + def get_authors(self, entry): + return entry['Authors'] + + def get_abstract(self, entry): + return entry['Abstract'] + + def get_keywords(self, entry, tfidf_model): + scores = tfidf_model.transform([entry['Abstract']])[0] + words = np.array(tfidf_model.get_feature_names()) + sorted_scores = np.argsort(scores.data) + top_scores = sorted_scores[:-(self.n_keywords + 1): -1] + keywords = words[scores.indices[top_scores]].tolist() + return '|'.join(keywords) + + def keyword_model(self, abstracts): + # Replace this if we get a list of keywords + # For now return top TF-IDF terms of words in abstracts + tfidf = TfidfVectorizer(stop_words='english').fit(abstracts) + return tfidf + + def get_session(self, entry): + # FIXME: Use this as a placeholder until we get some session info + return entry['Submission Type'] + + def parse_accepted_papers(self, tsv_file): + with open(tsv_file, 'r') as fd: + lines = [l.strip().split('\t') for l in fd] + header, paper_info = lines[0], lines[1:] + papers = [] + for paper in paper_info: + entry = {} + for i, h in enumerate(header): + entry[h] = paper[i] + papers.append(entry) + return papers + + def convert_entries(self, entries): + tfidf = self.keyword_model([e['Abstract'] for e in entries]) + + def get_new_entry(e): + return ( + e['Submission ID'], + e['Title'], + 
'|'.join(e['Authors'].split(',')), + '"{}"'.format(e['Abstract']), + self.get_keywords(e, tfidf), + # FIXME: Use this as a placeholder until session info + # is available + e['Submission Type'] + ) + new_entries = [get_new_entry(e) for e in entries] + return new_entries + + def convert(self, old_tsv, papers_csv, out_pickle=None): + old_entries = self.parse_accepted_papers(old_tsv) + new_entries = self.convert_entries(old_entries) + with open(papers_csv, 'w') as fd: + header = ','.join(NEW_HEADER) + fd.write('{}\n'.format(header)) + for entry in new_entries: + e = ','.join(entry) + fd.write('{}\n'.format(e)) + if out_pickle is not None: + cached_or = {} + for entry in new_entries: + cached_or[ + entry[0] # id + ] = openreview.Note( + '', [], [], [], + {'abstract': entry[3], 'title': entry[1]} + ) # Hack. ICLR Recommender script accepts Openreview notes + + with open(out_pickle, 'wb') as fd: + pickle.dump(cached_or, fd) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert CSV from original ACL format to Miniconf compatible format" + ) + parser.add_argument('--inp', type=str, help='Original ACL CSV') + parser.add_argument('--out', type=str, help='papers.csv') + parser.add_argument( + '--out-pickle', type=str, + help='Dump entries into a pickle compatible with ICLR Recommendation engine' + ) + parser.add_argument( + '--n-keywords', type=int, default=3, help='Number of keywords to keep') + return parser.parse_args() + + +def main(): + args = parse_args() + csv_converter = CsvConverter(n_keywords=args.n_keywords) + csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) + + +if __name__ == '__main__': + main() diff --git a/scripts/reduce.py b/scripts/reduce.py index 9d69fb9ab..4c18f6b55 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -2,6 +2,8 @@ import csv import json +import umap + import sklearn.manifold import torch @@ -18,7 +20,19 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() emb = torch.load(args.embeddings) - out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + if args.projection_method == 'tsne': + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) + elif args.projection_method == 'umap': + out = umap.UMAP( + n_neighbors=5, + min_dist=0.3, + metric='correlation', + n_components=2 + ).fit_transform(emb.numpy()) + else: + print('invalid projection-method: {}'.format(args.projection_method)) + print('Falling back to T-SNE') + out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) d = [] with open(args.papers, "r") as f: abstracts = list(csv.DictReader(f)) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 01bfdcd3c..cc0eb25bf 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,5 @@ transformers sklearn +umap-learn torch==1.4.0 ics From d97a8769c82e9f0bd5d28733056f65a611d00abe Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 17:24:23 +0300 Subject: [PATCH 02/14] Reformat code --- scripts/create_papers_csv.py | 83 ++++++++++++++++++------------------ scripts/reduce.py | 16 +++---- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index c5ffc9941..9f1d466ce 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -1,14 +1,14 @@ import argparse -import numpy as np import pickle -from sklearn.feature_extraction.text import TfidfVectorizer + +import numpy as np import openreview +from 
sklearn.feature_extraction.text import TfidfVectorizer + +OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] -OLD_HEADERS = [ - 'Submission ID', 'Title', 'Authors', 'Abstract', 'Submission Type' -] -NEW_HEADER = ['UID', 'title', 'authors', 'abstract', 'keywords', 'session'] +NEW_HEADER = ["UID", "title", "authors", "abstract", "keywords", "session"] class CsvConverter(object): @@ -16,38 +16,38 @@ def __init__(self, n_keywords=5): self.n_keywords = n_keywords def get_uid(self, entry): - return entry['Submission ID'] + return entry["Submission ID"] def get_title(self, entry): - return entry['Title'] + return entry["Title"] def get_authors(self, entry): - return entry['Authors'] + return entry["Authors"] def get_abstract(self, entry): - return entry['Abstract'] + return entry["Abstract"] def get_keywords(self, entry, tfidf_model): - scores = tfidf_model.transform([entry['Abstract']])[0] + scores = tfidf_model.transform([entry["Abstract"]])[0] words = np.array(tfidf_model.get_feature_names()) sorted_scores = np.argsort(scores.data) - top_scores = sorted_scores[:-(self.n_keywords + 1): -1] + top_scores = sorted_scores[: -(self.n_keywords + 1) : -1] keywords = words[scores.indices[top_scores]].tolist() - return '|'.join(keywords) + return "|".join(keywords) def keyword_model(self, abstracts): # Replace this if we get a list of keywords # For now return top TF-IDF terms of words in abstracts - tfidf = TfidfVectorizer(stop_words='english').fit(abstracts) + tfidf = TfidfVectorizer(stop_words="english").fit(abstracts) return tfidf def get_session(self, entry): # FIXME: Use this as a placeholder until we get some session info - return entry['Submission Type'] + return entry["Submission Type"] def parse_accepted_papers(self, tsv_file): - with open(tsv_file, 'r') as fd: - lines = [l.strip().split('\t') for l in fd] + with open(tsv_file, "r") as fd: + lines = [l.strip().split("\t") for l in fd] header, paper_info = lines[0], lines[1:] papers = [] for paper in paper_info: @@ -58,57 +58,58 @@ def parse_accepted_papers(self, tsv_file): return papers def convert_entries(self, entries): - tfidf = self.keyword_model([e['Abstract'] for e in entries]) + tfidf = self.keyword_model([e["Abstract"] for e in entries]) def get_new_entry(e): return ( - e['Submission ID'], - e['Title'], - '|'.join(e['Authors'].split(',')), - '"{}"'.format(e['Abstract']), + e["Submission ID"], + e["Title"], + "|".join(e["Authors"].split(",")), + '"{}"'.format(e["Abstract"]), self.get_keywords(e, tfidf), # FIXME: Use this as a placeholder until session info # is available - e['Submission Type'] + e["Submission Type"], ) + new_entries = [get_new_entry(e) for e in entries] return new_entries def convert(self, old_tsv, papers_csv, out_pickle=None): old_entries = self.parse_accepted_papers(old_tsv) new_entries = self.convert_entries(old_entries) - with open(papers_csv, 'w') as fd: - header = ','.join(NEW_HEADER) - fd.write('{}\n'.format(header)) + with open(papers_csv, "w") as fd: + header = ",".join(NEW_HEADER) + fd.write("{}\n".format(header)) for entry in new_entries: - e = ','.join(entry) - fd.write('{}\n'.format(e)) + e = ",".join(entry) + fd.write("{}\n".format(e)) if out_pickle is not None: cached_or = {} for entry in new_entries: - cached_or[ - entry[0] # id - ] = openreview.Note( - '', [], [], [], - {'abstract': entry[3], 'title': entry[1]} + cached_or[entry[0]] = openreview.Note( # id + "", [], [], [], {"abstract": entry[3], "title": entry[1]} ) # Hack. 
ICLR Recommender script accepts Openreview notes - with open(out_pickle, 'wb') as fd: + with open(out_pickle, "wb") as fd: pickle.dump(cached_or, fd) def parse_args(): parser = argparse.ArgumentParser( - description="Convert CSV from original ACL format to Miniconf compatible format" + description="Convert CSV from original ACL format to Miniconf " + "compatible format" ) - parser.add_argument('--inp', type=str, help='Original ACL CSV') - parser.add_argument('--out', type=str, help='papers.csv') + parser.add_argument("--inp", type=str, help="Original ACL CSV") + parser.add_argument("--out", type=str, help="papers.csv") parser.add_argument( - '--out-pickle', type=str, - help='Dump entries into a pickle compatible with ICLR Recommendation engine' + "--out-pickle", + type=str, + help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", ) parser.add_argument( - '--n-keywords', type=int, default=3, help='Number of keywords to keep') + "--n-keywords", type=int, default=3, help="Number of keywords to keep" + ) return parser.parse_args() @@ -118,5 +119,5 @@ def main(): csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/reduce.py b/scripts/reduce.py index 4c18f6b55..e13f03153 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -2,10 +2,9 @@ import csv import json -import umap - import sklearn.manifold import torch +import umap def parse_arguments(): @@ -20,18 +19,15 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() emb = torch.load(args.embeddings) - if args.projection_method == 'tsne': + if args.projection_method == "tsne": out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) - elif args.projection_method == 'umap': + elif args.projection_method == "umap": out = umap.UMAP( - n_neighbors=5, - min_dist=0.3, - metric='correlation', - n_components=2 + n_neighbors=5, min_dist=0.3, metric="correlation", n_components=2 ).fit_transform(emb.numpy()) else: - print('invalid projection-method: {}'.format(args.projection_method)) - print('Falling back to T-SNE') + print("invalid projection-method: {}".format(args.projection_method)) + print("Falling back to T-SNE") out = sklearn.manifold.TSNE(n_components=2).fit_transform(emb.numpy()) d = [] with open(args.papers, "r") as f: From 2fd143560c524b9a96889743b3e4adb25716a8eb Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 18:30:47 +0300 Subject: [PATCH 03/14] Add similar papers in poster page --- main.py | 5 +++++ templates/poster.html | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/main.py b/main.py index 037842e1f..a56281504 100644 --- a/main.py +++ b/main.py @@ -187,6 +187,11 @@ def poster(poster): uid = poster v = by_uid["papers"][uid] data = _data() + + data["openreview"] = format_paper(by_uid["papers"][uid]) + data["id"] = uid + data["paper_recs"] = [format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid]][1:] + data["paper"] = format_paper(v) return render_template("poster.html", **data) diff --git a/templates/poster.html b/templates/poster.html index c5d91c6e2..476767e62 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -123,6 +123,33 @@
     })

[hunk garbled in extraction: its 27 added lines insert a "Similar Papers" section into templates/poster.html; the surviving fragments show a "Similar Papers" heading, a {% for openreview in paper_recs %} loop that renders each recommendation's {{openreview.content.title}} and its authors via {% for a in openreview.content.authors %} {{a}}, {% endfor %}, and a closing {% endfor %}; the surrounding HTML markup did not survive]
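For reference, the `site_data["paper_recs"]` mapping that `main.py` reads above is just a dict from each paper UID to a ranked list of similar-paper UIDs, with the paper itself typically listed first (hence the `[1:]`). The guide added in the next patch builds it with the pretrained ICLR recommender; the sketch below is only a hypothetical stand-in for local testing that produces an equivalent mapping with cosine nearest neighbours over the BERT abstract embeddings. The file paths and the assumption that rows of `embeddings.torch` align with rows of `papers.csv` are not guaranteed by these patches.

```python
# Hypothetical stand-in, not part of this PR: build a paper_recs-style mapping with
# cosine nearest neighbours over the abstract embeddings from scripts/embeddings.py,
# instead of the pretrained ICLR recommender. Assumes embedding rows line up with papers.csv.
import csv
import json

import torch
from sklearn.neighbors import NearestNeighbors

with open("sitedata_acl2020/papers.csv") as f:
    uids = [row["UID"] for row in csv.DictReader(f)]

emb = torch.load("embeddings.torch").numpy()

# n_neighbors=6 returns each paper itself plus its five closest abstracts.
nn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(emb)
_, idx = nn.kneighbors(emb)

# Each list starts with the paper itself, which main.py strips with [1:].
paper_recs = {uids[i]: [uids[j] for j in row] for i, row in enumerate(idx)}

with open("paper_recs.json", "w") as f:
    json.dump(paper_recs, f, indent=2)
```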
From 31c19f76694a88c82db4061eb1b0e5ce34c72e25 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 19:02:25 +0300 Subject: [PATCH 04/14] Add guide to produce similar paper recommendations --- scripts/README.recommendations.md | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 scripts/README.recommendations.md diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md new file mode 100644 index 000000000..df0cd554c --- /dev/null +++ b/scripts/README.recommendations.md @@ -0,0 +1,41 @@ +# How to get similar paper recommendations + +In this guide we can see how to get paper recommendations using the pretrained model provided +from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) and abstract embeddings. + + + +## Create a visualization based on BERT embeddings + +1. Grab ACL2020 + [papers.csv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/papers.csv) + from this branch or a more recent version and copy it to `sitedata_acl2020`. +2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings + for the paper abstracts. +3. Run `python reduce.py ../acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > ../sitedata_acl2020/papers_projection.json --projection-method [tsne|umap]` + to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` + selects which dimensionality reduction technique to use. +4. Rerun `make run` and go to the paper visualization page + + +## Produce similar paper recommendations + +1. Grab the + [acl2020\_accepted\_papers.tsv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/acl2020_accepted_papers.tsv) + file. +2. Run `python scripts/create_papers_csv.py --inp acl2020_accepted_papers.tsv --out dummy.csv --out-pickle cached_or.pkl --n-keywords 5` to produce `cached_or.pkl`. + This file is compatible with the inference scripts provided in [https://github.com/ICLR/iclr.github.io/tree/master/recommendations](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) +3. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will + need `git-lfs` installed. +4. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` +5. Install missing requirements +6. `python recs.py`. This will run inference using a pretrained similarity model and produce the + `rec.pkl` file that contains the paper similarities. +7. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` + file that contains the similar paper recommendations that can be displayed to the website. Make + sure to modify the filepaths to point to the correct `cached_or.pkl`, `rec.pkl`. +8. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file + produced using this method is + [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json) +9. I have already modified the `poster.html` template and `main.py` to display the paper + recommendations in `54_add_similar_papers_graph` branch. 
From e556362b9b492f0a0a387d70361a31c8a82e4ecc Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 19:51:05 +0300 Subject: [PATCH 05/14] Reformat main.py --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index a56281504..9b0c98e80 100644 --- a/main.py +++ b/main.py @@ -190,7 +190,9 @@ def poster(poster): data["openreview"] = format_paper(by_uid["papers"][uid]) data["id"] = uid - data["paper_recs"] = [format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid]][1:] + data["paper_recs"] = [ + format_paper(by_uid["papers"][n]) for n in site_data["paper_recs"][uid] + ][1:] data["paper"] = format_paper(v) return render_template("poster.html", **data) From 5af073f7329fb35f99090de27ef4e3836ceeb8ad Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Fri, 12 Jun 2020 20:32:49 +0300 Subject: [PATCH 06/14] make image_path configurable --- scripts/create_papers_csv.py | 2 ++ templates/poster.html | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index 9f1d466ce..d4a22b194 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -3,8 +3,10 @@ import numpy as np import openreview + from sklearn.feature_extraction.text import TfidfVectorizer + OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] diff --git a/templates/poster.html b/templates/poster.html index 476767e62..a6d796506 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -143,7 +143,7 @@
                     {{a}},
                     {% endfor %}
[hunk garbled in extraction: per the commit message, the single removed/added pair swaps the hard-coded recommendation image source for the configurable image_path; the HTML markup around the changed line did not survive]
From 857fc8c5911d1c06ccfabeb6245f2fc70dd96950 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 13:51:15 +0300 Subject: [PATCH 07/14] format create_papers_csv --- scripts/create_papers_csv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py index d4a22b194..b290ba680 100644 --- a/scripts/create_papers_csv.py +++ b/scripts/create_papers_csv.py @@ -2,10 +2,9 @@ import pickle import numpy as np -import openreview - from sklearn.feature_extraction.text import TfidfVectorizer +import openreview OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] From 07b2f8ba260585a628e4dcef559f961f5bae0390 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 14:43:24 +0300 Subject: [PATCH 08/14] Update with latest papers.csv and simplify code --- scripts/README.recommendations.md | 2 +- scripts/create_papers_csv.py | 124 ----------------------- scripts/create_recommendations_pickle.py | 50 +++++++++ scripts/reduce.py | 1 + scripts/requirements.txt | 1 + 5 files changed, 53 insertions(+), 125 deletions(-) delete mode 100644 scripts/create_papers_csv.py create mode 100644 scripts/create_recommendations_pickle.py diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md index df0cd554c..1bc2bdd26 100644 --- a/scripts/README.recommendations.md +++ b/scripts/README.recommendations.md @@ -12,7 +12,7 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend from this branch or a more recent version and copy it to `sitedata_acl2020`. 2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings for the paper abstracts. -3. Run `python reduce.py ../acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > ../sitedata_acl2020/papers_projection.json --projection-method [tsne|umap]` +3. Run `python scripts/reduce.py --projection-method [tsne|umap] acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` selects which dimensionality reduction technique to use. 4. 
Rerun `make run` and go to the paper visualization page diff --git a/scripts/create_papers_csv.py b/scripts/create_papers_csv.py deleted file mode 100644 index b290ba680..000000000 --- a/scripts/create_papers_csv.py +++ /dev/null @@ -1,124 +0,0 @@ -import argparse -import pickle - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer - -import openreview - -OLD_HEADERS = ["Submission ID", "Title", "Authors", "Abstract", "Submission Type"] - - -NEW_HEADER = ["UID", "title", "authors", "abstract", "keywords", "session"] - - -class CsvConverter(object): - def __init__(self, n_keywords=5): - self.n_keywords = n_keywords - - def get_uid(self, entry): - return entry["Submission ID"] - - def get_title(self, entry): - return entry["Title"] - - def get_authors(self, entry): - return entry["Authors"] - - def get_abstract(self, entry): - return entry["Abstract"] - - def get_keywords(self, entry, tfidf_model): - scores = tfidf_model.transform([entry["Abstract"]])[0] - words = np.array(tfidf_model.get_feature_names()) - sorted_scores = np.argsort(scores.data) - top_scores = sorted_scores[: -(self.n_keywords + 1) : -1] - keywords = words[scores.indices[top_scores]].tolist() - return "|".join(keywords) - - def keyword_model(self, abstracts): - # Replace this if we get a list of keywords - # For now return top TF-IDF terms of words in abstracts - tfidf = TfidfVectorizer(stop_words="english").fit(abstracts) - return tfidf - - def get_session(self, entry): - # FIXME: Use this as a placeholder until we get some session info - return entry["Submission Type"] - - def parse_accepted_papers(self, tsv_file): - with open(tsv_file, "r") as fd: - lines = [l.strip().split("\t") for l in fd] - header, paper_info = lines[0], lines[1:] - papers = [] - for paper in paper_info: - entry = {} - for i, h in enumerate(header): - entry[h] = paper[i] - papers.append(entry) - return papers - - def convert_entries(self, entries): - tfidf = self.keyword_model([e["Abstract"] for e in entries]) - - def get_new_entry(e): - return ( - e["Submission ID"], - e["Title"], - "|".join(e["Authors"].split(",")), - '"{}"'.format(e["Abstract"]), - self.get_keywords(e, tfidf), - # FIXME: Use this as a placeholder until session info - # is available - e["Submission Type"], - ) - - new_entries = [get_new_entry(e) for e in entries] - return new_entries - - def convert(self, old_tsv, papers_csv, out_pickle=None): - old_entries = self.parse_accepted_papers(old_tsv) - new_entries = self.convert_entries(old_entries) - with open(papers_csv, "w") as fd: - header = ",".join(NEW_HEADER) - fd.write("{}\n".format(header)) - for entry in new_entries: - e = ",".join(entry) - fd.write("{}\n".format(e)) - if out_pickle is not None: - cached_or = {} - for entry in new_entries: - cached_or[entry[0]] = openreview.Note( # id - "", [], [], [], {"abstract": entry[3], "title": entry[1]} - ) # Hack. 
ICLR Recommender script accepts Openreview notes - - with open(out_pickle, "wb") as fd: - pickle.dump(cached_or, fd) - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Convert CSV from original ACL format to Miniconf " - "compatible format" - ) - parser.add_argument("--inp", type=str, help="Original ACL CSV") - parser.add_argument("--out", type=str, help="papers.csv") - parser.add_argument( - "--out-pickle", - type=str, - help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", - ) - parser.add_argument( - "--n-keywords", type=int, default=3, help="Number of keywords to keep" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - csv_converter = CsvConverter(n_keywords=args.n_keywords) - csv_converter.convert(args.inp, args.out, out_pickle=args.out_pickle) - - -if __name__ == "__main__": - main() diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py new file mode 100644 index 000000000..5e041455f --- /dev/null +++ b/scripts/create_recommendations_pickle.py @@ -0,0 +1,50 @@ +import argparse +import csv +import pickle + +import numpy as np +import openreview +from sklearn.feature_extraction.text import TfidfVectorizer + + +def read_entries(papers_csv): + with open(papers_csv, "r") as fd: + entries = list(csv.reader(fd, skipinitialspace=True)) + entries = entries[1:] # skip header + + return entries + + +def dump_cached_or(entries, out_pickle): + cached_or = {} + for entry in entries: + cached_or[entry[0]] = openreview.Note( # id + "", [], [], [], {"abstract": entry[3], "title": entry[1]} + ) # Hack. ICLR Recommender script accepts Openreview notes + + with open(out_pickle, "wb") as fd: + pickle.dump(cached_or, fd) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert CSV from original ACL format to Miniconf " + "compatible format" + ) + parser.add_argument("--inp", type=str, help="papers.csv") + parser.add_argument( + "--out", + type=str, + help="Dump entries into a pickle compatible with " "ICLR Recommendation engine", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + entries = read_entries(args.inp) + dump_cached_or(entries, args.out) + + +if __name__ == "__main__": + main() diff --git a/scripts/reduce.py b/scripts/reduce.py index e13f03153..0f24c5083 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -12,6 +12,7 @@ def parse_arguments(): parser.add_argument("papers", default=False, help="paper file") parser.add_argument("embeddings", default=False, help="embeddings file to shrink") + parser.add_argument("--projection-method", default="tsne", help="[umap|tsne]") return parser.parse_args() diff --git a/scripts/requirements.txt b/scripts/requirements.txt index cc0eb25bf..0da07c264 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,5 +1,6 @@ transformers sklearn umap-learn +openreview-py torch==1.4.0 ics From b21340643c7363a52393bb159212b26dc1026304 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 14:47:38 +0300 Subject: [PATCH 09/14] Remove unused imports --- scripts/create_recommendations_pickle.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py index 5e041455f..1e7ec7f26 100644 --- a/scripts/create_recommendations_pickle.py +++ b/scripts/create_recommendations_pickle.py @@ -2,9 +2,7 @@ import csv import pickle -import numpy as np import openreview -from sklearn.feature_extraction.text 
import TfidfVectorizer def read_entries(papers_csv): From 191363573dbc8f6b0828cb643496030db7230275 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:01:58 +0300 Subject: [PATCH 10/14] Ignore typecheck for openreview and umap-learn --- scripts/create_recommendations_pickle.py | 4 +++- scripts/reduce.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/create_recommendations_pickle.py b/scripts/create_recommendations_pickle.py index 1e7ec7f26..dea19581a 100644 --- a/scripts/create_recommendations_pickle.py +++ b/scripts/create_recommendations_pickle.py @@ -2,7 +2,9 @@ import csv import pickle -import openreview +import openreview # type: ignore + +# No type hints for openreview-py package. Ignore mypy def read_entries(papers_csv): diff --git a/scripts/reduce.py b/scripts/reduce.py index 0f24c5083..5fd1d4805 100644 --- a/scripts/reduce.py +++ b/scripts/reduce.py @@ -4,7 +4,9 @@ import sklearn.manifold import torch -import umap +import umap # type: ignore + +# No type stubs for umap-learn. Ignore mypy def parse_arguments(): From 2a52660e974b2c08e4d3654bfb7f8ce05e1c6e32 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:40:32 +0300 Subject: [PATCH 11/14] Modify poster.html to get correct id field --- templates/poster.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/poster.html b/templates/poster.html index 9d2e09f5e..a03ff2a36 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -151,7 +151,7 @@
                 Similar Papers
[hunk garbled in extraction: per the commit message, its two removed/added pairs point the similar-paper links at the correct id field; only the Jinja fragments {{openreview.content.title}}, {{a}}, {% endfor %} and the bare +/- markers survive]
From 7571602d0a535d1dbe5f81340206050653397ab2 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:44:39 +0300 Subject: [PATCH 12/14] refactor templates/poster.html --- templates/poster.html | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/templates/poster.html b/templates/poster.html index a03ff2a36..e199c2ca8 100644 --- a/templates/poster.html +++ b/templates/poster.html @@ -147,19 +147,19 @@
                 Similar Papers
[hunk garbled in extraction: its five removed/added pairs rename the Similar Papers loop variable from openreview to recommended, i.e. {% for openreview in paper_recs %} becomes {% for recommended in paper_recs %}, {{openreview.content.title}} becomes {{recommended.content.title}}, and {% for a in openreview.content.authors %} becomes {% for a in recommended.content.authors %}; the surrounding HTML markup did not survive]
From 9776a15dc72852257912a388303d7e0ba67c3330 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:50:03 +0300 Subject: [PATCH 13/14] Update README.md --- scripts/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index fcef59000..34b4c752a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,7 +1,6 @@ This directory contains extensions to help support the mini-conf library. -Follow the procedure described in [this gist](https://gist.github.com/georgepar/3d5cda48c50c6ee57f56aaea9b99603d) to obtain -the embeddings and the paper projections. +For the updated procedure on getting similar papers + recommendations refer to README.recommendations.md These include: From b5a94a89886aecdbd65b7afb78bdeff7fb1a35b5 Mon Sep 17 00:00:00 2001 From: Giorgos Paraskevopoulos Date: Mon, 15 Jun 2020 15:53:53 +0300 Subject: [PATCH 14/14] Update README.recommendations.md --- scripts/README.recommendations.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/scripts/README.recommendations.md b/scripts/README.recommendations.md index 1bc2bdd26..6d76fc8ac 100644 --- a/scripts/README.recommendations.md +++ b/scripts/README.recommendations.md @@ -12,7 +12,7 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend from this branch or a more recent version and copy it to `sitedata_acl2020`. 2. Run `python scripts/embeddings.py sitedata_acl2020/papers.csv` to produce the BERT embeddings for the paper abstracts. -3. Run `python scripts/reduce.py --projection-method [tsne|umap] acl-2020-virtual-conference-sitedata/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` +3. Run `python scripts/reduce.py --projection-method [tsne|umap] sitedata_acl2020/papers.csv embeddings.torch > sitedata_acl2020/papers_projection.json` to produce a 2D projection of the BERT embeddings for visualization. `--projection-method` selects which dimensionality reduction technique to use. 4. Rerun `make run` and go to the paper visualization page @@ -20,22 +20,16 @@ from [ICLR webpage](https://github.com/ICLR/iclr.github.io/tree/master/recommend ## Produce similar paper recommendations -1. Grab the - [acl2020\_accepted\_papers.tsv](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/acl2020_accepted_papers.tsv) - file. -2. Run `python scripts/create_papers_csv.py --inp acl2020_accepted_papers.tsv --out dummy.csv --out-pickle cached_or.pkl --n-keywords 5` to produce `cached_or.pkl`. +1. Run `python scripts/create_recommendations_pickle.py --inp sitedata_acl2020/papers.csv --out cached_or.pkl` to produce `cached_or.pkl`. This file is compatible with the inference scripts provided in [https://github.com/ICLR/iclr.github.io/tree/master/recommendations](https://github.com/ICLR/iclr.github.io/tree/master/recommendations) -3. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will +2. Clone [https://github.com/ICLR/iclr.github.io](https://github.com/ICLR/iclr.github.io). You will need `git-lfs` installed. -4. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` -5. Install missing requirements -6. `python recs.py`. This will run inference using a pretrained similarity model and produce the +3. `cp cached_or.pkl iclr.github.io && cd iclr.github.io/recommendations` +4. Install missing requirements +5. `python recs.py`. 
This will run inference using a pretrained similarity model and produce the `rec.pkl` file that contains the paper similarities. -7. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` +6. You can use the `iclr.github.io/data/pkl_to_json.py` script to produce the `paper_recs.json` file that contains the similar paper recommendations that can be displayed to the website. Make sure to modify the filepaths to point to the correct `cached_or.pkl`, `rec.pkl`. -8. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file - produced using this method is - [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json) -9. I have already modified the `poster.html` template and `main.py` to display the paper - recommendations in `54_add_similar_papers_graph` branch. +7. Grab the produced `paper_recs.json` file and copy it to `sitedata_acl2020`. A version of this file + produced using this method is [here](https://github.com/acl-org/acl-2020-virtual-conference-sitedata/blob/add_acl2020_accepted_papers_tsv/paper_recs.json)
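Before rerunning `make run`, it is worth a quick check that the copied `paper_recs.json` has the shape `main.py` reads: a dict mapping each paper UID to a list of similar-paper UIDs, with the paper itself typically first. A minimal sketch, assuming that structure and hypothetical paths:

```python
# Hypothetical spot check, not part of this PR: print the stored recommendations
# for one paper, resolving UIDs to titles via papers.csv.
import csv
import json

with open("sitedata_acl2020/papers.csv") as f:
    titles = {row["UID"]: row["title"] for row in csv.DictReader(f)}

with open("sitedata_acl2020/paper_recs.json") as f:
    paper_recs = json.load(f)

uid = next(iter(paper_recs))
print("Recommendations for:", titles.get(uid, uid))
for rec_uid in paper_recs[uid][1:]:  # skip the paper itself, as main.py does
    print("  -", titles.get(rec_uid, rec_uid))
```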