# Mirror of https://github.com/samuelclay/NewsBlur.git
# Synced 2025-09-18 21:50:56 +00:00
# 381 lines, 15 KiB, Python
import datetime
|
|
import os
|
|
import tempfile
|
|
from collections import defaultdict
|
|
|
|
import mongoengine as mongo
|
|
from django.contrib.auth.models import User
|
|
from django.core.paginator import Paginator
|
|
from django.db import models
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
|
|
|
|
from apps.reader.models import UserSubscription, UserSubscriptionFolders
|
|
from apps.rss_feeds.models import Feed
|
|
from utils import json_functions as json
|
|
|
|
# from surprise import NMF, SVD, Dataset, KNNBasic, KNNWithMeans, Reader, accuracy
|
|
# from surprise.model_selection import cross_validate, train_test_split
|
|
|
|
|
|
class RecommendedFeed(models.Model):
    """A feed recommended by a user, with dates tracking its review state."""

    # The feed being recommended.
    feed = models.ForeignKey(Feed, related_name="recommendations", on_delete=models.CASCADE)
    # The user who submitted the recommendation.
    user = models.ForeignKey(User, related_name="recommendations", on_delete=models.CASCADE)
    # Optional free-form blurb describing the feed.
    description = models.TextField(null=True, blank=True)
    is_public = models.BooleanField(default=False)
    created_date = models.DateField(auto_now_add=True)
    # Both nullable: presumably set when the recommendation is reviewed.
    approved_date = models.DateField(null=True)
    declined_date = models.DateField(null=True)
    # Optional Twitter handle associated with the recommendation.
    twitter = models.CharField(max_length=50, null=True, blank=True)

    def __str__(self):
        # Falls back to the submission date while approved_date is unset.
        return "%s (%s)" % (self.feed, self.approved_date or self.created_date)

    class Meta:
        # Most recently approved (then most recently created) first.
        ordering = ["-approved_date", "-created_date"]
|
|
|
|
|
|
class RecommendedFeedUserFeedback(models.Model):
    """A single user's score on a RecommendedFeed."""

    recommendation = models.ForeignKey(RecommendedFeed, related_name="feedback", on_delete=models.CASCADE)
    user = models.ForeignKey(User, related_name="feed_feedback", on_delete=models.CASCADE)
    # Numeric vote; defaults to 0 (exact scale not visible here).
    score = models.IntegerField(default=0)
    created_date = models.DateField(auto_now_add=True)
|
|
|
|
|
|
class MFeedFolder(mongo.Document):
    """Per-feed tally of the folder names subscribers file the feed under."""

    feed_id = mongo.IntField()
    folder = mongo.StringField()  # lower-cased folder title
    count = mongo.IntField()

    meta = {
        "collection": "feed_folders",
        "indexes": ["feed_id", "folder"],
        "allow_inheritance": False,
    }

    def __str__(self):
        feed = Feed.get_by_id(self.feed_id)
        return "%s - %s (%s)" % (feed, self.folder, self.count)

    @classmethod
    def count_feed(cls, feed_id):
        """Tally which folder titles subscribers of `feed_id` use.

        Returns a list of (folder_title, count) tuples sorted by count
        descending, and prints it for interactive use.  (Previously the
        result was only printed, never returned.)
        """
        feed = Feed.get_by_id(feed_id)
        found_folders = defaultdict(int)
        user_ids = [sub["user_id"] for sub in UserSubscription.objects.filter(feed=feed).values("user_id")]
        usf = UserSubscriptionFolders.objects.filter(user_id__in=user_ids)
        for sub in usf:
            user_sub_folders = json.decode(sub.folders)
            folder_title = cls.feed_folder_parent(user_sub_folders, feed.pk)
            # Top-level subscriptions come back as "" and are skipped,
            # matching the original behavior.
            if not folder_title:
                continue
            found_folders[folder_title.lower()] += 1
        sorted_folders = sorted(found_folders.items(), key=lambda f: f[1], reverse=True)
        print(sorted_folders)
        return sorted_folders

    @classmethod
    def feed_folder_parent(cls, folders, feed_id, folder_title=""):
        """Depth-first search of the nested folder structure for `feed_id`.

        Returns the containing folder's title, "" when the feed sits at the
        top level, or None when the feed is not present at all.
        """
        for item in folders:
            if isinstance(item, int) and item == feed_id:
                return folder_title
            elif isinstance(item, dict):
                # Each dict maps a folder title to its list of children.
                for sub_title, sub_items in item.items():
                    found_title = cls.feed_folder_parent(sub_items, feed_id, sub_title)
                    if found_title:
                        return found_title
        return None
|
|
|
|
|
|
class CollaborativelyFilteredRecommendation(models.Model):
    """Experimental collaborative-filtering feed recommendations.

    NOTE(review): the `surprise` imports (SVD, NMF, KNNBasic, Reader,
    Dataset, accuracy) are commented out at the top of this file, so the
    training methods below raise NameError until they are restored.
    """

    @classmethod
    def store_user_feed_data_to_file(cls, file_name, force=False, skip=None):
        """Append `user_id,feed_id,reach_score` rows for every user to `file_name`.

        Only healthy feeds are included (>= 5 subscribers, >= 1 story last
        month, >= 1 active subscriber, a story within 30 days).  `skip`
        resumes from that user pk; `force` re-dumps even if the file exists.
        """
        skip = skip or 0
        if os.path.exists(file_name) and not force and skip == 0:
            print(f"{file_name} exists, skipping storing data...")
            return

        # Loop-invariant: compute the cutoff once instead of per user.
        last_month = datetime.datetime.now() - datetime.timedelta(days=30)
        max_user_pk = User.objects.latest("pk").pk
        # Append mode so interrupted runs can resume via `skip`; `with`
        # guarantees the handle is closed (it previously leaked).
        with open(file_name, "a+") as temp_file:
            for user_id in range(skip, max_user_pk):
                try:
                    user = User.objects.get(pk=user_id)
                except User.DoesNotExist:
                    continue
                subs = UserSubscription.objects.filter(
                    user=user,
                    feed__num_subscribers__gte=5,
                    feed__stories_last_month__gte=1,
                    feed__active_subscribers__gte=1,
                    feed__last_story_date__gte=last_month,
                )
                for sub in subs:
                    well_read_score = sub.feed.well_read_score(user_id=sub.user_id)["reach_score"]
                    if not well_read_score:
                        continue
                    temp_file.write(f"{user.id},{sub.feed_id},{well_read_score}\n")
                temp_file.flush()
                if user_id % 1000 == 0:
                    print(f"User {user_id} saved to {file_name}")

    @classmethod
    def svd(cls, trainset, testset):
        """Train an SVD model on `trainset`, print RMSE on `testset`, return it."""
        model = SVD()
        model.fit(trainset)
        predictions = model.test(testset)
        accuracy.rmse(predictions)
        return model

    @classmethod
    def nmf(cls, trainset):
        """Train and return an NMF model on `trainset`."""
        model = NMF()
        model.fit(trainset)
        # Removed unreachable code that followed this return (a commented
        # cross_validate call and a duplicate `return model`).
        return model

    @classmethod
    def get_predicted_ratings(cls, model, user_id, all_feed_ids):
        """Return {feed_id: predicted_rating} for every feed in `all_feed_ids`."""
        return {feed_id: model.predict(user_id, feed_id).est for feed_id in all_feed_ids}

    @classmethod
    def get_recommendations(cls, user_id, feed_ids, model, n=10):
        """Return the top `n` (feed_id, predicted_rating) pairs for `user_id`."""
        # surprise wants raw ids as strings when the data was loaded from file.
        predictions = [model.predict(str(user_id), str(feed_id), verbose=False) for feed_id in feed_ids]
        # Predictions of exactly 1 are ranked last — presumably the model's
        # fallback/default estimate; confirm against the rating scale in use.
        sorted_predictions = sorted(predictions, key=lambda x: x.est if x.est != 1 else 0, reverse=True)
        return [(pred.iid, pred.est) for pred in sorted_predictions[:n]]

    @classmethod
    def load_knn_model(cls, file_name):
        """OOM: item-item KNN over the full dataset can exhaust memory."""
        print(f"Loading user item rating from {file_name}")
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 100))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()
        # Counts must come from the trainset — `Dataset` has no
        # n_users/n_items, so the original print here raised AttributeError.
        print(f"Training set has {trainset.n_users} users and {trainset.n_items} items")

        # KNNBasic configured for item-item cosine similarities.
        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
        model.fit(trainset)

        return trainset, model

    @classmethod
    def load_knnbasic_model(cls, file_name):
        """OOM: same as load_knn_model but with a 0-1 rating scale."""
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()

        # Print the number of users and items from trainset, not data
        print(f"Training model with {trainset.n_users} users and {trainset.n_items} items")

        # Configure KNNBasic for item-item similarities
        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
        model.fit(trainset)

        return trainset, model

    @classmethod
    def get_feed_similarities(cls, trainset, model, feed_id, n=10):
        """OOM: return the raw ids of the `n` feeds most similar to `feed_id`."""
        # Map the raw feed id to the trainset's inner id space.
        feed_inner_id = trainset.to_inner_iid(str(feed_id))
        neighbors = model.get_neighbors(feed_inner_id, k=n)
        return [trainset.to_raw_iid(inner_id) for inner_id in neighbors]

    @classmethod
    def recommend_similar_feeds_for_folder(cls, trainset, model, folder_feeds, n=10):
        """Recommend feeds similar to a folder's members, ranked by how often
        they appear among each member feed's nearest neighbors."""
        all_similar_feeds = defaultdict(float)
        for feed_id in folder_feeds:
            for similar_feed in cls.get_feed_similarities(trainset, model, feed_id, n):
                all_similar_feeds[similar_feed] += 1  # Count occurrences for ranking

        # Most frequently co-occurring first, excluding the folder's own feeds.
        sorted_feeds = sorted(all_similar_feeds, key=all_similar_feeds.get, reverse=True)
        return [feed for feed in sorted_feeds if feed not in folder_feeds][:n]

    @classmethod
    def load_svd_model(cls, file_name):
        """Load ratings from `file_name` and train an SVD model on them."""
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()

        print("Training SVD model")  # fixed "SVG" typo
        model = SVD()
        model.fit(trainset)
        print("SVD model trained")

        return trainset, model

    @classmethod
    def get_item_similarities(cls, model):
        """OOM: pairwise cosine similarity matrix of the SVD item factors."""
        # model.qi holds one latent-factor row per item.
        item_factors = model.qi
        return cosine_similarity(item_factors)

    # @classmethod
    # def build_faiss_index(cls, model):
    #     # Retrieve item factor vectors (embeddings)
    #     item_factors = model.qi.astype("float32")  # Faiss requires float32 type
    #
    #     # Build the Faiss index
    #     index = faiss.IndexFlatL2(item_factors.shape[1])
    #     index.add(item_factors)
    #
    #     return index

    # @classmethod
    # def build_faiss_ivfpq_index(cls, model, nlists=100):
    #     item_factors = model.qi.astype("float32")
    #     dim = item_factors.shape[1]
    #
    #     # Choose an M that divides dim. This is just an example, adjust as needed.
    #     M = 4 if dim % 4 == 0 else 8 if dim % 8 == 0 else 1  # Adjust this based on your actual dimension
    #
    #     # Quantizer and Index
    #     quantizer = faiss.IndexFlatL2(dim)
    #     index = faiss.IndexIVFPQ(quantizer, dim, nlists, M, 8)  # Adjusted M
    #     index.train(item_factors)
    #     index.add(item_factors)
    #
    #     return index

    @classmethod
    def recommend_similar_feeds_for_item_nnmf(cls, model, trainset, user_id, feed_ids, n=10):
        """Recommend feeds liked by the users who liked the seed `feed_ids`.

        NOTE(review): the original had three bugs fixed here: it unpacked
        all_ratings() as (_, uid, _) — taking the ITEM inner id as the user;
        it called get_recommendations(trainset, uid, model, n), which does
        not match that method's (user_id, feed_ids, model, n) signature; and
        it compared (feed_id, est) tuples against raw feed ids.
        """
        users_who_liked_feeds = set()

        # all_ratings() yields (inner_uid, inner_iid, rating); convert inner
        # user ids to the raw ids that model.predict() expects.
        for inner_uid, _inner_iid, _rating in trainset.all_ratings():
            raw_uid = trainset.to_raw_uid(inner_uid)
            for feed_id in feed_ids:
                if model.predict(raw_uid, feed_id).est > 0.5:  # Assuming > 0.5 implies interaction/like
                    users_who_liked_feeds.add(raw_uid)
                    break  # already counted; no need to test remaining seeds

        # Candidate pool: every feed the trainset knows about.
        all_feed_ids = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]

        all_recommendations = defaultdict(int)
        print(f"Number of users who liked the feeds: {len(users_who_liked_feeds)}")
        for uid in users_who_liked_feeds:
            user_recs = cls.get_recommendations(uid, all_feed_ids, model, n)
            print(f"Recommendations for user {uid}: {user_recs}")
            for rec_feed_id, _est in user_recs:
                if rec_feed_id not in feed_ids:  # Exclude original feeds
                    all_recommendations[rec_feed_id] += 1

        # Rank feeds by how many similar users they were recommended to.
        sorted_recommendations = sorted(all_recommendations, key=all_recommendations.get, reverse=True)
        return sorted_recommendations[:n]
|
|
|
|
|
|
class SubscriptionBasedRecommendation:
    """Recommendations from overlap in users' subscription sets.

    Each user becomes a "document" of space-separated feed ids; users are
    compared with TF-IDF vectors and cosine similarity.
    """

    @classmethod
    def store_user_feed_data_to_file(cls, file_name):
        """Write `user_id,feed_id,1` rows for every subscription to `file_name`.

        Only feeds with >= 5 subscribers are included.  No-op if the file
        already exists.
        """
        if os.path.exists(file_name):
            print(f"{file_name} exists, skipping storing data...")
            return

        users = User.objects.all().order_by("pk")
        paginator = Paginator(users, 1000)
        # `with` guarantees the dump file is closed (it previously leaked).
        with open(file_name, "w+") as temp_file:
            for page_num in paginator.page_range:
                for user in paginator.page(page_num):
                    # Only include feeds with num_subscribers >= 5
                    subs = UserSubscription.objects.filter(user=user, feed__num_subscribers__gte=5)
                    for sub in subs:
                        temp_file.write(f"{user.id},{sub.feed_id},1\n")
                print(f"Page {page_num} of {paginator.num_pages} saved to {file_name}")
                temp_file.flush()

    @classmethod
    def generate_user_subscription_documents(cls, file_name):
        """Read `user_id,feed_id,rating` rows from `file_name` and return one
        space-separated feed-id string per user, in first-seen user order."""
        user_subscriptions = defaultdict(list)
        with open(file_name, "r") as f:
            for line in f:
                user_id, feed_id, _ = line.strip().split(",")
                user_subscriptions[user_id].append(feed_id)

        # Convert each user's feed-id list to a space-separated "document".
        return [" ".join(feeds) for feeds in user_subscriptions.values()]

    @classmethod
    def recommend_feeds_for_user(cls, user_index, user_subscriptions, n=10):
        """Recommend feeds for the user at `user_index`, drawn from the
        subscriptions of the most similar other users."""
        # Vectorize all users' subscription documents.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(user_subscriptions)

        # Cosine similarity of this user against every user (incl. self).
        cosine_similarities = linear_kernel(tfidf_matrix[user_index], tfidf_matrix).flatten()

        # argsort is ascending; the -1 end drops the top match (the user
        # themselves, similarity 1.0), leaving the next n+1 users.
        similar_users = cosine_similarities.argsort()[-n - 2 : -1][::-1]

        # Union the similar users' feeds, then drop what the user already has.
        recommended_feeds = set()
        for idx in similar_users:
            recommended_feeds.update(user_subscriptions[idx].split())
        current_user_feeds = set(user_subscriptions[user_index].split())

        return list(recommended_feeds - current_user_feeds)

    @classmethod
    def recommend_feeds_for_feed_set(cls, feed_ids, user_subscriptions, n=10):
        """Recommend feeds for an arbitrary set of `feed_ids` by treating it
        as a synthetic user profile and finding similar real users."""
        # Same space-separated format as user_subscriptions documents.
        user_profile = " ".join(feed_ids)

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(user_subscriptions + [user_profile])

        # Last row is the synthetic profile; compare it to all real users.
        cosine_similarities = linear_kernel(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        # (Removed an unused `strongly_similar_users` threshold filter.)

        # Top n most similar real users.
        similar_users = cosine_similarities.argsort()[-n:][::-1]

        recommended_feeds = set()
        for idx in similar_users:
            recommended_feeds.update(user_subscriptions[idx].split())

        # Exclude the seed feeds themselves.
        return list(recommended_feeds - set(feed_ids))
|