import json
from pathlib import Path

import numpy as np

from load_data import load_filtered_data
from remove_noise import simplify_comma_separated_string


def compute_game_increment_value(game):
    increment_value = game['positive']
    return increment_value


def compute_game_raw_score(game):
    raw_score = game['positive'] / (game['positive'] + game['negative'])
    return raw_score


def compute_game_num_votes(game):
    num_votes = game['positive'] + game['negative']
    return num_votes


def compute_dev_increment_value(dev):
    increment_value = np.sum(dev['scores'])
    return increment_value


def compute_dev_raw_score(dev):
    raw_score = np.mean(dev['scores'])
    return raw_score


def compute_dev_num_votes(dev):
    num_votes = len(dev['scores'])
    return num_votes


def choose_prior(data, keyword=None):
    # Objective: compute the prior (average score and average number of votes)
    # over the whole dataset, either at the game level (keyword is None) or at
    # the developer/publisher level.
    if keyword is None:
        list_increment_values = [
            compute_game_increment_value(game) for game in data.values()
        ]
        list_num_votes = [compute_game_num_votes(game) for game in data.values()]
    else:
        list_increment_values = [
            compute_dev_increment_value(dev) for dev in data.values()
        ]
        list_num_votes = [compute_dev_num_votes(dev) for dev in data.values()]

    prior = {}
    prior['raw_score'] = np.sum(list_increment_values) / np.sum(list_num_votes)
    prior['num_votes'] = np.mean(list_num_votes)

    return prior
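
# A minimal worked example for choose_prior(), with hypothetical review counts:
# for two games A (70 positive, 30 negative) and B (40 positive, 60 negative),
#   prior['raw_score'] = (70 + 40) / (100 + 100) = 0.55
#   prior['num_votes'] = mean([100, 100])        = 100
# i.e. the prior score is the overall fraction of positive reviews, anchored at
# the average number of reviews per game.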


def compute_bayesian_average_for_an_element(element, prior, keyword=None):
    # The Bayesian average shrinks the raw score towards the prior score, with
    # a strength which decreases as the element accumulates more votes.
    if keyword is None:
        raw_score = compute_game_raw_score(element)
        num_votes = compute_game_num_votes(element)
    else:
        raw_score = compute_dev_raw_score(element)
        num_votes = compute_dev_num_votes(element)

    bayesian_average = (
        prior['num_votes'] * prior['raw_score'] + num_votes * raw_score
    ) / (prior['num_votes'] + num_votes)

    return bayesian_average
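
# A worked example with hypothetical numbers: with the prior above
# (raw_score=0.55, num_votes=100), a game rated 9 positive vs. 1 negative
# (raw_score=0.9, num_votes=10) gets:
#   (100 * 0.55 + 10 * 0.9) / (100 + 10) = 64 / 110 ≈ 0.582
# so a small sample of glowing reviews is pulled strongly towards the prior.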


def compute_bayesian_average_for_every_element(data, keyword=None):
    prior = choose_prior(data, keyword)

    for app_id in data:
        data[app_id]['bayesian_average'] = compute_bayesian_average_for_an_element(
            data[app_id],
            prior,
            keyword,
        )

    return data, prior


def get_separator():
    separator = ', '
    return separator


def match_data_by_keyword(data, keyword='developers'):
    # Objective: create a dictionary which maps developers (or publishers) to a list of appIDs
    matched_data = {}

    for app_id in data:
        text = simplify_comma_separated_string(data[app_id][keyword])
        for keyword_value in {value.strip() for value in text.split(get_separator())}:
            try:
                matched_data[keyword_value].append(app_id)
            except KeyError:
                matched_data[keyword_value] = [app_id]

    return matched_data
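
# For instance, assuming simplify_comma_separated_string() leaves these plain
# comma-separated names unchanged, hypothetical input data such as:
#   {'10': {'developers': 'Valve'},
#    '20': {'developers': 'Valve, Hidden Path Entertainment'}}
# would be matched to:
#   {'Valve': ['10', '20'], 'Hidden Path Entertainment': ['20']}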


def normalize_game_weights(game_weights):
    # Objective: scale the weights so that the sum of the weights is equal to
    # the number of weights, which is here equal to the number of games.
    #
    # NB 1: the sum of the SCALED weights MATCHES the sum of the weights in the
    # UNWEIGHTED situation, i.e. in the case where every term of the sum has an
    # identical weight, equal to 1.
    #
    # NB 2: this scale is important because these weights will appear in a
    # weighted sum, not a weighted average.
    num_games = len(game_weights)
    game_weights = np.multiply(game_weights, num_games / np.sum(game_weights))
    return game_weights
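
# A worked example with hypothetical weights: [10, 30] (two games, with 10 and
# 30 reviews) is scaled by 2 / 40, which yields [0.5, 1.5]; the scaled weights
# sum to 2, the number of games, exactly as unit weights would.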


def group_data_by_keyword(data, keyword='developers'):
    # Objective: aggregate game reviews for each developer (or publisher)
    matched_data = match_data_by_keyword(data, keyword)

    grouped_data = {}
    for keyword_value in matched_data:
        grouped_data[keyword_value] = {}
        grouped_data[keyword_value]['name'] = keyword_value
        grouped_data[keyword_value]['positive'] = 0
        grouped_data[keyword_value]['negative'] = 0

        game_weights = []

        for app_id in matched_data[keyword_value]:
            grouped_data[keyword_value]['positive'] += data[app_id]['positive']
            grouped_data[keyword_value]['negative'] += data[app_id]['negative']

            game_score = data[app_id]['bayesian_average']
            try:
                grouped_data[keyword_value]['scores'].append(game_score)
            except KeyError:
                grouped_data[keyword_value]['scores'] = [game_score]

            game_weights.append(compute_game_num_votes(data[app_id]))

        # Once the iteration over app_ids is over, scale the weights:
        grouped_data[keyword_value]['weights'] = normalize_game_weights(game_weights)

    return grouped_data
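
# Continuing the hypothetical example above, the entry for 'Valve' would look
# like {'name': 'Valve', 'positive': ..., 'negative': ..., 'scores': [...],
# 'weights': [...]}, where 'scores' collects the Bayesian averages of the
# matched games and 'weights' their normalized review counts.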


def check_string(data, keyword='developers'):
    # Objective: check what remains after calls to simplify_comma_separated_string(),
    # so that one could try to improve simplify_comma_separated_string().
    for app_id in data:
        text = data[app_id][keyword]
        text = simplify_comma_separated_string(text)
        if get_separator() in text:
            print(f'appID={app_id:7}\t{text}')
    return


def get_ranking(data):
    ranking = sorted(
        data.keys(),
        key=lambda element: data[element]['bayesian_average'],
        reverse=True,
    )
    return ranking


def simplify_url_item(url_item):
    # Percent-encode the two characters (space and comma) expected to appear in
    # developer and publisher names, so that the name can be embedded in a URL,
    # e.g. 'Hidden Path Entertainment' becomes 'Hidden%20Path%20Entertainment'.
    simplified_url_item = url_item.replace(' ', '%20').replace(',', '%2C')
    return simplified_url_item


def print_ranking(data, ranking, keyword=None, num_elements=250, markdown_format=True):
    steam_store_url = 'https://store.steampowered.com/app/'
    if keyword == 'developers':
        steam_search_url = 'https://store.steampowered.com/search/?developer='
    elif keyword == 'publishers':
        steam_search_url = 'https://store.steampowered.com/search/?publisher='
    else:
        steam_search_url = 'https://store.steampowered.com/search/?term='

    for i, element in enumerate(ranking[:num_elements]):
        element_name = data[element]['name']

        if markdown_format:
            if keyword == 'games':
                # noinspection SpellCheckingInspection
                app_id = data[element]['appid']
                hyperlink = (
                    '[' + element_name + '](' + steam_store_url + str(app_id) + ')'
                )
            else:
                hyperlink = (
                    '['
                    + element_name
                    + ']('
                    + steam_search_url
                    + simplify_url_item(element_name)
                    + ')'
                )
        else:
            hyperlink = element_name

        print(
            f'{1 + i:4}.\t'
            + hyperlink
            + ' ({:1.3f})'.format(data[element]['bayesian_average']),
        )

    return True


def print_prior(prior):
    hyperlink_to_github = (
        ' ; [reference](https://github.com/woctezuma/Steam-Bayesian-Average)'
    )
    print(
        'Prior: score={:1.3f} ; size={:3.0f}'.format(
            prior['raw_score'],
            prior['num_votes'],
        )
        + hyperlink_to_github,
    )
    return


def merge_game_scores_and_weights(grouped_data):
    # Caveat: this is experimental! The weights are a hack so that a developer's
    # score is not disrupted by each new game release: a freshly released game
    # has few reviews, hence a small weight.
    for keyword_value in grouped_data:
        grouped_data[keyword_value]['scores'] = np.multiply(
            grouped_data[keyword_value]['scores'],
            grouped_data[keyword_value]['weights'],
        )
    return grouped_data
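
# A worked example with hypothetical values: a developer with per-game scores
# [0.8, 0.6] and normalized weights [0.5, 1.5] ends up with weighted scores
# [0.4, 0.9], so compute_dev_raw_score() returns mean([0.4, 0.9]) = 0.65
# instead of mean([0.8, 0.6]) = 0.7: the game with more reviews counts more.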


def run_bayesian_average_workflow(data, keyword=None, criterion='the most reliable'):
    # Bayesian average for games
    enhanced_game_data, game_prior = compute_bayesian_average_for_every_element(
        data,
        keyword=None,
    )

    if keyword is None:
        keyword = 'games'
        criterion = 'the most acclaimed'
        enhanced_data = enhanced_game_data
        prior = game_prior
    else:
        grouped_data = group_data_by_keyword(enhanced_game_data, keyword)

        if criterion.endswith('established'):
            # Bayesian averages of games are weighted for each developer (or publisher).
            grouped_data = merge_game_scores_and_weights(grouped_data)

        # Bayesian average for developers (or publishers)
        if criterion.endswith('reliable') or criterion.endswith('established'):
            # Bayesian averages of games are aggregated for each developer (or publisher).
            enhanced_data, prior = compute_bayesian_average_for_every_element(
                grouped_data,
                keyword=keyword,
            )
        else:
            # Positive and negative reviews of games are aggregated for each developer (or publisher).
            enhanced_data, prior = compute_bayesian_average_for_every_element(
                grouped_data,
            )

    print('\n# Ranking of ' + criterion + ' ' + keyword + '\n')
    print_prior(prior)

    ranking = get_ranking(enhanced_data)
    print_ranking(enhanced_data, ranking, keyword)

    return enhanced_data, prior, ranking
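
# A minimal usage sketch, assuming `data` is the dict returned by
# load_filtered_data(), keyed by appID with 'name', 'positive', 'negative',
# 'developers' and 'publishers' fields:
#
#   enhanced_data, prior, ranking = run_bayesian_average_workflow(
#       data,
#       keyword='developers',
#       criterion='the most reliable',
#   )
#
# The three supported criteria differ in how game reviews are aggregated for
# each developer (or publisher):
#   - 'the most acclaimed':   positive and negative review counts are pooled;
#   - 'the most reliable':    per-game Bayesian averages are averaged;
#   - 'the most established': per-game Bayesian averages are weighted by
#                             review counts before being averaged.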


def main(verbose=False):
    filtered_data = load_filtered_data()
    print('[SteamSpy ; filtered] number of games = ' + str(len(filtered_data)))

    # Game data, including the Bayesian average.
    # NB: the ranking of the most acclaimed games is returned but not used here.
    enhanced_data, _, _ = run_bayesian_average_workflow(filtered_data)

    # Optionally, for use in other projects, export the database including Bayesian averages:
    temp_filename = 'data/game_data.json'
    with Path(temp_filename).open('w', encoding='utf8') as f:
        json.dump(enhanced_data, f)

    if verbose:
        for keyword in ['developers', 'publishers']:
            check_string(enhanced_data, keyword)

    # Rankings for developers and publishers
    for criterion in [
        'the most acclaimed',
        'the most reliable',
        'the most established',
    ]:
        for keyword in ['developers', 'publishers']:
            run_bayesian_average_workflow(enhanced_data, keyword, criterion)

    return True


if __name__ == '__main__':
    main()