-
Notifications
You must be signed in to change notification settings - Fork 0
/
engine.py
114 lines (94 loc) · 4.69 KB
/
engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
from pyspark.mllib.recommendation import ALS
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_counts_and_averages(ID_and_ratings_tuple):
"""Given a tuple (gameID, ratings_iterable)
returns (gameID, (ratings_count, ratings_avg))
"""
nratings = len(ID_and_ratings_tuple[1])
return ID_and_ratings_tuple[0], (nratings, float(sum(x for x in ID_and_ratings_tuple[1])) / nratings)
class RecommendationEngine:
"""A game recommendation engine
"""
def __count_and_average_ratings(self):
"""Updates the games ratings counts from
the current data self.ratings_RDD
"""
logger.info("Counting game ratings...")
game_ID_with_ratings_RDD = self.ratings_RDD.map(lambda x: (x[1], x[2])).groupByKey()
game_ID_with_avg_ratings_RDD = game_ID_with_ratings_RDD.map(get_counts_and_averages)
self.game_rating_counts_RDD = game_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))
def __train_model(self):
"""Train the ALS model with the current dataset
"""
logger.info("Training the ALS model...")
self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
iterations=self.iterations, lambda_=self.regularization_parameter)
logger.info("ALS model built!")
def __predict_ratings(self, user_and_game_RDD):
"""Gets predictions for a given (userID, gameID) formatted RDD
Returns: an RDD with format (gameID, gameRating)
"""
predicted_RDD = self.model.predictAll(user_and_game_RDD)
predicted_rating_RDD = predicted_RDD.map(lambda x: (x.product, x.rating))
logger.info("predicted ratings: " + str(predicted_rating_RDD.takeOrdered(5)))
logger.info("rating counts: " + str(self.game_rating_counts_RDD.takeOrdered(5)))
predicted_rating_title_and_count_RDD = \
predicted_rating_RDD.join(self.game_rating_counts_RDD)
predicted_rating_title_and_count_RDD = \
predicted_rating_title_and_count_RDD.map(lambda r: (r[0], r[1][0], r[1][1]))
return predicted_rating_title_and_count_RDD
def add_ratings(self, ratings):
"""Add additional game ratings in the format (user_id, game_id, rating)
"""
# Convert ratings to an RDD
new_ratings_RDD = self.sc.parallelize(ratings)
# Add new ratings to the existing ones
self.ratings_RDD = self.original_ratings_RDD.union(new_ratings_RDD)
# Re-compute game ratings count
self.__count_and_average_ratings()
# Re-train the ALS model with the new ratings
self.__train_model()
return ratings
def get_ratings_for_game_ids(self, user_id, game_ids):
"""Given a user_id and a list of game_ids, predict ratings for them
"""
requested_games_RDD = self.sc.parallelize(game_ids).map(lambda x: (user_id, x))
# Get predicted ratings
ratings = self.__predict_ratings(requested_games_RDD).collect()
return ratings
def get_top_ratings(self, user_id, games_count):
"""Recommends up to games_count top unrated games to user_id
"""
# Get pairs of (userID, gameID) for user_id unrated games
user_unrated_games_RDD = self.ratings_RDD.filter(lambda rating: not rating[0] == user_id) \
.map(lambda x: (user_id, x[1])).distinct()
# Get predicted ratings
ratings = self.__predict_ratings(user_unrated_games_RDD).takeOrdered(games_count,
key=lambda x: -x[1])
return ratings
def __init__(self, sc, dataset_path):
"""Init the recommendation engine given a Spark context and a dataset path
"""
logger.info("Starting up the Recommendation Engine: ")
self.sc = sc
# Load ratings data for later use
logger.info("Loading Ratings data...")
ratings_file_path = os.path.join(dataset_path, 'game_ratings.csv')
ratings_raw_RDD = self.sc.textFile(ratings_file_path)
self.ratings_RDD = ratings_raw_RDD.map(lambda line: line.split(","))\
.map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))\
.cache()
self.original_ratings_RDD = self.ratings_RDD
logger.info("Hello")
logger.info("Ratings Data: " + str(self.ratings_RDD.takeOrdered(5)))
# Pre-calculate games ratings counts
self.__count_and_average_ratings()
# Train the model
self.rank = 8
self.seed = 5L
self.iterations = 10
self.regularization_parameter = 0.1
self.__train_model()