# Recommendation.py
import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Set up logging
Path("logs").mkdir(exist_ok=True)
logging.basicConfig(
    filename="logs/recommendation.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Define function to load data
def load_data(file_path):
    try:
        data = pd.read_csv(file_path)
        logging.info(f"Data loaded successfully from {file_path}")
        return data
    except Exception as e:
        logging.error(f"Error loading data from {file_path}: {e}")
        return None
# Define function to preprocess data (mutates merged_data in place)
def preprocess_data(merged_data):
    try:
        # Convert engagement columns to numeric, coercing errors to NaN, then fill with 0
        merged_data['comment_count'] = pd.to_numeric(merged_data['comment_count'], errors='coerce').fillna(0)
        merged_data['upvote_count'] = pd.to_numeric(merged_data['upvote_count'], errors='coerce').fillna(0)
        merged_data['view_count'] = pd.to_numeric(merged_data['view_count'], errors='coerce').fillna(0)
        merged_data['share_count_x'] = pd.to_numeric(merged_data['share_count_x'], errors='coerce').fillna(0)
        # Calculate engagement score as the sum of the four interaction counts
        merged_data['engagement_score'] = (
            merged_data['comment_count'] +
            merged_data['upvote_count'] +
            merged_data['view_count'] +
            merged_data['share_count_x']
        )
        # Ensure all text columns are strings before concatenation
        merged_data['title'] = merged_data['title'].astype(str)
        merged_data['bio'] = merged_data['bio'].astype(str)
        merged_data['category.name'] = merged_data['category.name'].astype(str)
        merged_data['category.description'] = merged_data['category.description'].astype(str)
        # Combine text-based columns into a metadata summary for the content
        merged_data['metadata'] = (
            merged_data['title'] + ' ' +
            merged_data['bio'] + ' ' +
            merged_data['category.name'] + ' ' +
            merged_data['category.description']
        )
        logging.info("Data preprocessing completed successfully.")
    except Exception as e:
        logging.error(f"Error during data preprocessing: {e}")
# Define function for TF-IDF vectorization and cosine similarity
def compute_cosine_similarity(merged_data):
    try:
        tfidf = TfidfVectorizer(stop_words="english")
        tfidf_matrix = tfidf.fit_transform(merged_data["metadata"])
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
        logging.info("Cosine similarity computed successfully.")
        return cosine_sim
    except Exception as e:
        logging.error(f"Error during TF-IDF vectorization or cosine similarity computation: {e}")
        return None
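# Minimal sketch of what this returns, on a hypothetical two-post frame
# (the example text is illustrative, not from the dataset):
#
#   toy = pd.DataFrame({"metadata": ["cooking pasta recipe", "cooking tips"]})
#   sim = compute_cosine_similarity(toy)
#   # sim is a symmetric 2x2 array: 1.0 on the diagonal (each post vs itself)
#   # and a value in (0, 1) off the diagonal, driven by the shared term "cooking".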
# Function to recommend content based on similarity
def recommend_content_based(video_id, cosine_sim, posts_df):
    try:
        # Assumes posts_df has a default RangeIndex aligned with cosine_sim rows
        idx = posts_df[posts_df["id"] == video_id].index[0]
        similarity_scores = list(enumerate(cosine_sim[idx]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        # Skip position 0, which is the query video itself (similarity 1.0)
        similar_videos = [posts_df.iloc[i[0]]["video_link"] for i in similarity_scores[1:6]]
        return similar_videos
    except Exception as e:
        logging.error(f"Error in recommend_content_based: {e}")
        return []
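# Hedged usage sketch (the video id below is hypothetical):
#
#   links = recommend_content_based(video_id=42, cosine_sim=cosine_sim, posts_df=merged_data)
#   # links is a list of up to 5 video_link values, ordered by descending
#   # TF-IDF cosine similarity of the posts' metadata text.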
# Define function for collaborative filtering
def recommend_collaborative(user, user_item_matrix, user_factors, item_factors, df):
    try:
        user_idx = user_item_matrix.index.get_loc(user)
        user_vector = user_factors[user_idx]
        # Score every item: (n_components,) dot (n_components, n_items) -> (n_items,)
        scores = np.dot(user_vector, item_factors)
        top_item_indices = np.argsort(scores)[::-1][:5]
        ids = user_item_matrix.columns[top_item_indices]
        recommendations = df[df["id"].isin(ids)]
        return recommendations[["username", "id", "title", "video_link", "engagement_score"]].to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error in recommend_collaborative: {e}")
        return []
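# Sketch of the factorization this relies on, assuming the SVD fit in the
# main script below: user_factors has shape (n_users, 10) and item_factors
# has shape (10, n_items), so a user's predicted affinity for every item is
# a single matrix-vector product:
#
#   scores = user_factors[user_idx] @ item_factors   # shape (n_items,)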
# Define function for new user (cold start) recommendations
def recommend_new_user_videos(merged_data):
    try:
        top_categories = (
            merged_data.groupby("category.name")["view_count"]
            .sum()
            .sort_values(ascending=False)
            .head(6)
            .index
        )
        category_recommendations = (
            merged_data[merged_data["category.name"].isin(top_categories)]
            .sort_values(by="view_count", ascending=False)
            .head(6)
        )
        top_rated_videos = (
            merged_data.sort_values(by="average_rating", ascending=False)
            .head(6)
        )
        most_commented_videos = (
            merged_data.sort_values(by="comment_count", ascending=False)
            .head(6)
        )
        top_viewed_videos = (
            merged_data.sort_values(by="view_count", ascending=False)
            .head(6)
        )
        recommendations = pd.concat(
            [category_recommendations, top_rated_videos, most_commented_videos, top_viewed_videos]
        ).drop_duplicates(subset="id")
        # Sample at most 5 so this does not raise when fewer than 5 rows survive de-duplication
        return recommendations[
            ["id", "title", "category.name", "video_link", "view_count", "average_rating", "comment_count"]
        ].sample(min(5, len(recommendations))).to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error in recommend_new_user_videos: {e}")
        return []
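# The cold-start pool above mixes four popularity signals (top categories by
# total views, rating, comments, and raw views), then samples from the merged,
# de-duplicated pool, so repeated calls for new users return varied picks
# rather than the same fixed top-5 list.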
# Define hybrid recommendation function: collaborative filtering for known
# users, popularity-based cold start for everyone else
def recommend_hybrid(user, user_item_matrix, user_factors, item_factors, cosine_sim, posts_df):
    try:
        if user in user_item_matrix.index:
            # Known user: score with the collaborative (SVD) model, which
            # already returns a list of record dicts
            return recommend_collaborative(user, user_item_matrix, user_factors, item_factors, posts_df)
        else:
            # Unknown user: fall back to popularity-based recommendations
            return recommend_new_user_videos(posts_df)
    except Exception as e:
        logging.error(f"Error in recommend_hybrid: {e}")
        return []
# Main script
merged_data = load_data(r"C:\Users\Debasish Das\Desktop\Internship Project\Hybrid_rccomendation_System\data\preprocessed\merged_data.csv")
if merged_data is not None:
    preprocess_data(merged_data)
    cosine_sim = compute_cosine_similarity(merged_data)
    # Create a user-item interaction matrix (rows: usernames, columns: post ids,
    # values: engagement_score, 0 where a user never interacted with a post)
    user_item_matrix = merged_data.pivot_table(index="username", columns="id", values="engagement_score", fill_value=0)
    # Apply SVD: user_factors is (n_users, 10), item_factors is (10, n_items);
    # TruncatedSVD requires more than 10 item columns for n_components=10
    svd = TruncatedSVD(n_components=10)
    user_factors = svd.fit_transform(user_item_matrix)
    item_factors = svd.components_
    # user = "New_user"  # uncomment to exercise the cold-start path
    user = "Rishwanth"  # test with an existing user
    recommended_videos = recommend_hybrid(user, user_item_matrix, user_factors, item_factors, cosine_sim, merged_data)
    print("Recommended videos:", recommended_videos)
    with open('recommendations.json', 'w') as file:
        json.dump(recommended_videos, file, indent=4)