generatenicelda.py
# creates the nice .html page
# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run
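# inputs:  papers.p (from scrape.py), topwords.p (from pdftowordcloud.py),
#          ldaphi.p (presumably written by the LDA step), nipsnice_template.html
# output:  nipsnice.html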
import cPickle as pickle
from numpy import argmax, zeros, ones
from math import log
# load the pickle of papers scraped from the HTML page (result of scrape.py)
paperdict = pickle.load(open( "papers.p", "rb" ))
print "Loaded %d papers from papers.p (generated by scrape.py)" % (len(paperdict), )
# load the top word frequencies (result of pdftowordcloud.py)
topdict = pickle.load(open("topwords.p", "rb"))
print "Loaded %d entries from topwords.p (generated by pdftowordcloud.py)" % (len(topdict), )
# load LDA words and invert their dictionary list
(ldak, phi, voca) = pickle.load(open("ldaphi.p", "rb"))
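# ldak is the number of LDA topics, phi is the (ldak x vocabulary-size)
# topic-word matrix, and voca is the vocabulary list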
wtoid = {}
for i, w in enumerate(voca):
    wtoid[w] = i
# compute pairwise distances between papers based on top words
# using something similar to tfidf, but simpler. No vectors
# will be normalized or otherwise harmed during this computation.
# first compute inverse document frequency (idf)
N = len(paperdict) # number of documents
idf = {}
for pid, p in enumerate(paperdict):
    tw = topdict.get(p, [])  # top 100 words
    ts = [x[0] for x in tw]
    for t in ts:
        idf[t] = idf.get(t, 0.0) + 1.0
for t in idf:
    idf[t] = log(N / idf[t], 2)
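# i.e. idf[t] = log2(N / df_t): with N = 1000 papers, a word in the top lists
# of 10 papers gets idf log2(100) ~ 6.64, while a word in every paper's list
# gets log2(1) = 0 and contributes nothing to the similarity scores below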
# now compute weighted intersection
ds = zeros((N, N))
for pid, p in enumerate(paperdict):
    tw = topdict.get(p, [])
    w = set([x[0] for x in tw])  # just the words
    for pid2, p2 in enumerate(paperdict):
        if pid2 < pid: continue
        tw2 = topdict.get(p2, [])
        w2 = set([x[0] for x in tw2])  # just the words
        # tw and tw2 are top 100 words as (word, count) in both papers.
        # Compute the intersection!
        winter = w.intersection(w2)
        score = sum([idf[x] for x in winter])
        ds[pid, pid2] = score
        ds[pid2, pid] = score
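# ds is now a symmetric N x N matrix: ds[i, j] is the summed idf of the top
# words papers i and j share, which drives the "rank by similarity" links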
# build up the string for html
html = open("nipsnice_template.html", "r").read()
s = ""
js = "ldadist=["
js2 = "pairdists=["
for pid, p in enumerate(paperdict):
    # pid goes 0...N-1; p are the keys, pointing to actual paper IDs as given
    # by NIPS, roughly 1...1500 with gaps
    # get title, author
    title, author = paperdict[p]

    # create the tags string
    topwords = topdict.get(p, [])
    # some top100 words may not have been computed during LDA so exclude them
    # if they aren't found in wtoid
    t = [x[0] for x in topwords if x[0] in wtoid]
    tid = [int(argmax(phi[:, wtoid[x]])) for x in t]  # assign each word to a class (topic)
    tcat = ""
    for k in range(ldak):
        ws = [x for i, x in enumerate(t) if tid[i] == k]
        tcat += '[<span class="t' + str(k) + '">' + ", ".join(ws) + '</span>] '

    # count up the complete topic distribution for the entire document and
    # build up a javascript vector storing all this
    svec = zeros(ldak)
    for w in t:
        svec += phi[:, wtoid[w]]
    if svec.sum() == 0:
        svec = ones(ldak) / ldak
    else:
        svec = svec / svec.sum()  # normalize
    nums = ["%.2f" % (float(svec[k]), ) for k in range(ldak)]
    js += "[" + ",".join(nums) + "]"
    if pid != len(paperdict) - 1: js += ","

    # dump similarities of this document to others
    scores = ["%.2f" % (float(ds[pid, i]), ) for i in range(N)]
    js2 += "[" + ",".join(scores) + "]"
    if pid != len(paperdict) - 1: js2 += ","

    # get path to thumbnails for this paper
    thumbpath = "thumbs/NIPS2012_%s.pdf.jpg" % (p, )

    # get links to PDF, supplementary and bibtex on NIPS servers
    pdflink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.pdf" % (p, )
    bibtexlink = "http://books.nips.cc/papers/files/nips25/bibhtml/NIPS2012_%s.html" % (p, )
    supplink = "http://books.nips.cc/papers/files/nips25/NIPS2012_%s.extra.zip" % (p, )

    s += """
    <div class="apaper" id="pid%d">
      <div class="paperdesc">
        <span class="ts">%s</span><br />
        <span class="as">%s</span><br /><br />
      </div>
      <div class="dllinks">
        <a href="%s">[pdf] </a>
        <a href="%s">[bibtex] </a>
        <a href="%s">[supplementary]<br /></a>
        <span class="sim" id="sim%d">[rank by tf-idf similarity to this]</span><br />
        <span class="abstr" id="ab%d">[abstract]</span>
      </div>
      <img src="%s"><br />
      <div class="abstrholder" id="abholder%d"></div>
      <span class="tt">%s</span>
    </div>
    """ % (pid, title, author, pdflink, bibtexlink, supplink, pid, int(p), thumbpath, int(p), tcat)
newhtml = html.replace("RESULTTABLE", s)
js += "]"
newhtml = newhtml.replace("LOADDISTS", js)
js2 += "]"
newhtml = newhtml.replace("PAIRDISTS", js2)
f = open("nipsnice.html", "w")
f.write(newhtml)
f.close()
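# done: open nipsnice.html in a browser (it expects the thumbs/ directory
# next to it) to view the generated page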