mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-09-18 21:50:56 +00:00
Using tfidf and lightfm for recs; it currently only recommends the same feeds every time.
This commit is contained in:
parent
b398bed406
commit
c360682cc3
2 changed files with 109 additions and 0 deletions
107
apps/recommendations/management/commands/lightfm.py
Normal file
107
apps/recommendations/management/commands/lightfm.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from lightfm import LightFM
|
||||
from scipy.sparse import coo_matrix
|
||||
from sklearn.feature_extraction.text import TfidfTransformer
|
||||
|
||||
from apps.rss_feeds.models import Feed
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Print feeds similar to a given feed, using a LightFM model trained on a ratings CSV.

    The CSV is read without a header row and is expected to have four columns,
    in order: ``user_id``, ``feed_id``, ``rating``, ``num_subscribers``.
    User and feed ids are used directly as row/column indices of the sparse
    interaction matrix.
    """

    help = "Recommend feeds based on a feed ID using LightFM"

    def add_arguments(self, parser):
        """Register the ``--path`` (CSV location) and ``--feed`` (target feed ID) options."""
        parser.add_argument(
            "--path",
            "-p",
            type=str,
            required=False,
            default="docker/volumes/surprise/user_feed_rating_subs.500k.csv",
            help="Path to the CSV file containing user, feed, and rating data",
        )
        parser.add_argument(
            "--feed",
            "-f",
            type=int,
            required=True,
            help="ID of the feed for which to generate recommendations",
        )

    def handle(self, *args, **options):
        """Train LightFM on the CSV and print the feeds most similar to ``--feed``.

        Raises:
            CommandError: if the CSV file does not exist, contains no rows, or
                does not contain any interaction for the requested feed.
        """
        csv_path = options["path"]
        feed_id = options["feed"]

        # Only the read can raise FileNotFoundError, so keep the try body minimal.
        try:
            data = pd.read_csv(
                csv_path, header=None, names=["user_id", "feed_id", "rating", "num_subscribers"]
            )
        except FileNotFoundError as err:
            raise CommandError('File "%s" does not exist.' % csv_path) from err

        if data.empty:
            raise CommandError('File "%s" contains no rows.' % csv_path)

        # Compress the rating scale; log1p keeps a rating of 0 at 0 instead of -inf.
        data["rating"] = np.log1p(data["rating"])

        # Rows are user ids, columns are feed ids; duplicate pairs are summed by tocsr().
        user_feed_matrix = coo_matrix((data["rating"], (data["user_id"], data["feed_id"]))).tocsr()
        n_feeds = user_feed_matrix.shape[1]
        self.stdout.write("Successfully loaded and transformed data")

        # Ids that never occur in the CSV still get a column (and an untrained
        # embedding), so track which columns are backed by real data.
        has_interactions = np.zeros(n_feeds, dtype=bool)
        has_interactions[data["feed_id"].to_numpy()] = True

        # Fail before the expensive training step if the target feed is unknown.
        if not 0 <= feed_id < n_feeds or not has_interactions[feed_id]:
            raise CommandError('Feed %s has no interactions in "%s".' % (feed_id, csv_path))

        model = LightFM(loss="warp")
        self.stdout.write("Training data...")
        model.fit(user_feed_matrix, epochs=30, num_threads=2)

        self.stdout.write(
            self.style.SUCCESS(f"Generating recommendations for feed: {Feed.get_by_id(feed_id)}")
        )

        feed_subscribers = self._subscribers_by_feed(data, n_feeds)

        # Without item features, these are the per-feed latent vectors learned by fit().
        _, item_embeddings = model.get_item_representations()
        top_recommended_feeds = self._rank_similar_feeds(
            feed_id, item_embeddings, feed_subscribers, has_interactions
        )
        self.stdout.write(f"Found {len(top_recommended_feeds)} recommendations for feed ID {feed_id}")

        for recommended_id in top_recommended_feeds:
            feed = Feed.get_by_id(recommended_id)
            # Skip ids for which the lookup yields nothing usable (e.g. a feed
            # present in the CSV but no longer in the database -- TODO confirm).
            if not feed:
                continue
            self.stdout.write(f"\tFeed: {feed}")

    @staticmethod
    def _subscribers_by_feed(data, n_feeds):
        """Return an array of length ``n_feeds`` mapping feed id -> subscriber count.

        Feeds absent from ``data`` get 0. When a feed appears on several rows the
        last row wins; duplicates are dropped first because NumPy does not
        guarantee which value survives when a fancy-index assignment repeats an
        index.
        """
        feed_subscribers = np.zeros(n_feeds)
        last_rows = data.drop_duplicates("feed_id", keep="last")
        feed_subscribers[last_rows["feed_id"].to_numpy()] = last_rows["num_subscribers"].to_numpy()
        return feed_subscribers

    @staticmethod
    def _rank_similar_feeds(feed_id, item_embeddings, feed_subscribers, candidate_mask, n_items=10):
        """Return up to ``n_items`` feed ids most similar to ``feed_id``.

        Similarity is the cosine between LightFM item embeddings, so the result
        depends on the requested feed. Scores are then divided by
        ``1 + log1p(subscribers)`` to damp very popular feeds (a heuristic).

        Args:
            feed_id: column index of the target feed.
            item_embeddings: array of shape (n_feeds, n_components).
            feed_subscribers: array of length n_feeds with subscriber counts.
            candidate_mask: boolean array of length n_feeds; only True entries
                may be recommended.
            n_items: maximum number of feed ids to return.

        Returns:
            A list of plain ``int`` feed ids, best match first.
        """
        norms = np.linalg.norm(item_embeddings, axis=1)
        norms[norms == 0] = 1.0  # avoid 0/0 for all-zero vectors
        unit_vectors = item_embeddings / norms[:, np.newaxis]
        similarity = (unit_vectors @ unit_vectors[feed_id]).astype(np.float64)

        # Negative counts would make log1p return nan/-inf; treat them as 0.
        damping = 1 + np.log1p(np.clip(feed_subscribers, 0, None))
        scores = similarity / damping

        # Never recommend the target itself or ids that are not in the data.
        scores[~candidate_mask] = -np.inf
        scores[feed_id] = -np.inf

        ranked = np.argsort(-scores, kind="stable")[:n_items]
        return [int(idx) for idx in ranked if np.isfinite(scores[idx])]
|
|
@ -118,6 +118,8 @@ stripe==2.55.1
|
|||
subdomains==3.0.1
|
||||
scikit-learn~=1.3.1
|
||||
scikit-surprise~=1.1.3
|
||||
pandas~=2.4.1
|
||||
lightfm~=1.17
|
||||
text-unidecode==1.3
|
||||
tiktoken~=0.4.0
|
||||
toml==0.10.2
|
||||
|
|
Loading…
Add table
Reference in a new issue