From 711a6fbbfc0dba584aa423804d80308f9804fb87 Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Sat, 6 Jul 2024 08:01:28 -0400
Subject: [PATCH] Removing unused scikit-learn code.

---
 .vscode/settings.json          |   2 +-
 apps/recommendations/models.py | 301 ---------------------------------
 apps/rss_feeds/models.py       |  79 ---------
 config/requirements.txt        |   3 +-
 4 files changed, 2 insertions(+), 383 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index a3734ab3e..9a3e5ae37 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,7 +8,7 @@
     ],
     "editor.formatOnSave": true,
     "editor.codeActionsOnSave": {
-        "source.organizeImports": true
+        "source.organizeImports": "explicit"
     },
     "python.linting.enabled": true,
     "python.linting.pylintEnabled": false,
diff --git a/apps/recommendations/models.py b/apps/recommendations/models.py
index ad8349c58..ecde21aee 100644
--- a/apps/recommendations/models.py
+++ b/apps/recommendations/models.py
@@ -7,16 +7,11 @@ import mongoengine as mongo
 from django.contrib.auth.models import User
 from django.core.paginator import Paginator
 from django.db import models
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
 
 from apps.reader.models import UserSubscription, UserSubscriptionFolders
 from apps.rss_feeds.models import Feed
 from utils import json_functions as json
 
-# from surprise import NMF, SVD, Dataset, KNNBasic, KNNWithMeans, Reader, accuracy
-# from surprise.model_selection import cross_validate, train_test_split
-
 
 class RecommendedFeed(models.Model):
     feed = models.ForeignKey(Feed, related_name="recommendations", on_delete=models.CASCADE)
@@ -83,299 +78,3 @@ class MFeedFolder(mongo.Document):
             sub_folder_title = cls.feed_folder_parent(f_v, feed_id, f_k)
             if sub_folder_title:
                 return sub_folder_title
-
-
-class CollaborativelyFilteredRecommendation(models.Model):
-    @classmethod
-    def store_user_feed_data_to_file(cls, file_name, force=False, skip=None):
-        if not skip:
-            skip = 0
-        if os.path.exists(file_name) and not force and skip == 0:
-            print(f"{file_name} exists, skipping storing data...")
-            return
-
-        temp_file = open(file_name, "a+")
-        max_user_pk = User.objects.latest("pk").pk
-        for user_id in range(skip, max_user_pk):
-            try:
-                user = User.objects.get(pk=user_id)
-            except User.DoesNotExist:
-                continue
-            # Only include active feeds with at least 5 subscribers and a story in the last month
-            last_month = datetime.datetime.now() - datetime.timedelta(days=30)
-            subs = UserSubscription.objects.filter(
-                user=user,
-                feed__num_subscribers__gte=5,
-                feed__stories_last_month__gte=1,
-                feed__active_subscribers__gte=1,
-                feed__last_story_date__gte=last_month,
-            )
-            for sub in subs:
-                well_read_score = sub.feed.well_read_score(user_id=sub.user_id)["reach_score"]
-                if not well_read_score:
-                    continue
-                temp_file.write(f"{user.id},{sub.feed_id},{well_read_score}\n")
-            temp_file.flush()
-            if user_id % 1000 == 0:
-                print(f"User {user_id} saved to {file_name}")
-
-    @classmethod
-    def svd(cls, trainset, testset):
-        model = SVD()
-        model.fit(trainset)
-        predictions = model.test(testset)
-        accuracy.rmse(predictions)
-
-        return model
-
-    @classmethod
-    def nmf(cls, trainset):
-        model = NMF()
-        model.fit(trainset)
-        # cross_validate(model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
-
-        return model
-
-    @classmethod
-    def get_predicted_ratings(cls, model, user_id, all_feed_ids):
-        predicted_ratings = {}
-        for feed_id in all_feed_ids:
-            pred = model.predict(user_id, feed_id)
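-            # pred.est is the model's estimated rating for this (user, feed) pair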
-            predicted_ratings[feed_id] = pred.est
-        return predicted_ratings
-
-    @classmethod
-    def get_recommendations(cls, user_id, feed_ids, model, n=10):
-        # Predict ratings for all feeds
-        predictions = [model.predict(str(user_id), str(feed_id), verbose=False) for feed_id in feed_ids]
-
-        # Sort by highest predicted rating
-        sorted_predictions = sorted(predictions, key=lambda x: x.est if x.est != 1 else 0, reverse=True)
-
-        # Return top n feed IDs as recommendations
-        return [(pred.iid, pred.est) for pred in sorted_predictions[:n]]
-
-    @classmethod
-    def load_knn_model(cls, file_name):
-        """OOM"""
-        print(f"Loading user item rating from {file_name}")
-        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 100))
-        data = Dataset.load_from_file(file_name, reader)
-        # Note: user/item counts live on the trainset, not the Dataset
-        trainset = data.build_full_trainset()
-        print(f"Training set has {trainset.n_users} users and {trainset.n_items} items")
-
-        # Using KNNBasic to compute item-item similarities
-        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
-        model.fit(trainset)
-
-        return trainset, model
-
-    @classmethod
-    def load_knnbasic_model(cls, file_name):
-        """OOM"""
-        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
-        data = Dataset.load_from_file(file_name, reader)
-        trainset = data.build_full_trainset()
-
-        # Print the number of users and items from trainset, not data
-        print(f"Training model with {trainset.n_users} users and {trainset.n_items} items")
-
-        # Configure KNNBasic for item-item similarities
-        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
-        model.fit(trainset)
-
-        return trainset, model
-
-    @classmethod
-    def get_feed_similarities(cls, trainset, model, feed_id, n=10):
-        """OOM"""
-        # Retrieve the inner id of the feed
-        feed_inner_id = trainset.to_inner_iid(str(feed_id))
-
-        # Get the top N most similar feeds
-        neighbors = model.get_neighbors(feed_inner_id, k=n)
-        similar_feeds = [trainset.to_raw_iid(inner_id) for inner_id in neighbors]
-
-        return similar_feeds
-
-    @classmethod
-    def recommend_similar_feeds_for_folder(cls, trainset, model, folder_feeds, n=10):
-        all_similar_feeds = defaultdict(float)
-
-        for feed_id in folder_feeds:
-            similar_feeds = cls.get_feed_similarities(trainset, model, feed_id, n)
-            for sf in similar_feeds:
-                all_similar_feeds[sf] += 1  # Count occurrences for ranking
-
-        # Sort feeds based on occurrence and take top N
-        sorted_feeds = sorted(all_similar_feeds, key=all_similar_feeds.get, reverse=True)
-        recommendations = [feed for feed in sorted_feeds if feed not in folder_feeds][:n]
-
-        return recommendations
-
-    @classmethod
-    def load_svd_model(cls, file_name):
-        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
-        data = Dataset.load_from_file(file_name, reader)
-        trainset = data.build_full_trainset()
-
-        print(f"Training SVD model")
-        model = SVD()
-        model.fit(trainset)
-        print(f"SVD model trained")
-
-        return trainset, model
-
-    @classmethod
-    def get_item_similarities(cls, model):
-        """OOM"""
-        # Retrieve item factor vectors (embeddings)
-        item_factors = model.qi
-
-        # Compute cosine similarity between item embeddings
-        item_similarities = cosine_similarity(item_factors)
-
-        return item_similarities
-
-    # @classmethod
-    # def build_faiss_index(cls, model):
-    #     # Retrieve item factor vectors (embeddings)
-    #     item_factors = model.qi.astype("float32")  # Faiss requires float32 type
-
-    #     # Build the Faiss index
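-    #     # IndexFlatL2 performs an exact L2 nearest-neighbor search over the item embeddings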
-    #     index = faiss.IndexFlatL2(item_factors.shape[1])
-    #     index.add(item_factors)
-
-    #     return index
-
-    # @classmethod
-    # def build_faiss_ivfpq_index(cls, model, nlists=100):
-    #     item_factors = model.qi.astype("float32")
-    #     dim = item_factors.shape[1]
-
-    #     # Choose an M that divides dim. This is just an example, adjust as needed.
-    #     M = 4 if dim % 4 == 0 else 8 if dim % 8 == 0 else 1  # Adjust this based on your actual dimension
-
-    #     # Quantizer and Index
-    #     quantizer = faiss.IndexFlatL2(dim)
-    #     index = faiss.IndexIVFPQ(quantizer, dim, nlists, M, 8)  # Adjusted M
-    #     index.train(item_factors)
-    #     index.add(item_factors)
-
-    #     return index
-
-    @classmethod
-    def recommend_similar_feeds_for_item_nnmf(cls, model, trainset, user_id, feed_ids, n=10):
-        users_who_liked_feeds = set()
-
-        # Collect users who interacted with these feeds (all_ratings yields (uid, iid, rating))
-        for uid, _, _ in trainset.all_ratings():
-            for feed_id in feed_ids:
-                if model.predict(uid, feed_id).est > 0.5:  # Assuming > 0.5 implies interaction/like
-                    users_who_liked_feeds.add(uid)
-
-        # Predict feeds for these users
-        all_recommendations = {}
-        print(f"Number of users who liked the feeds: {len(users_who_liked_feeds)}")
-        for uid in users_who_liked_feeds:
-            user_recs = cls.get_recommendations(uid, feed_ids, model, n)
-            print(f"Recommendations for user {uid}: {user_recs}")
-            for rec_feed_id, _est in user_recs:
-                if rec_feed_id not in feed_ids:  # Exclude original feeds
-                    all_recommendations[rec_feed_id] = all_recommendations.get(rec_feed_id, 0) + 1
-
-        # Sort feeds based on how many times they appear as recommendations
-        sorted_recommendations = sorted(all_recommendations, key=all_recommendations.get, reverse=True)
-
-        return sorted_recommendations[:n]
-
-
-class SubscriptionBasedRecommendation:
-    @classmethod
-    def store_user_feed_data_to_file(cls, file_name):
-        if os.path.exists(file_name):
-            print(f"{file_name} exists, skipping storing data...")
-            return
-
-        temp_file = open(file_name, "w+")
-        users = User.objects.all().order_by("pk")
-        paginator = Paginator(users, 1000)
-        for page_num in paginator.page_range:
-            users = paginator.page(page_num)
-            for user in users:
-                # Only include feeds with num_subscribers >= 5
-                subs = UserSubscription.objects.filter(user=user, feed__num_subscribers__gte=5)
-                # print(f"User {user} has {subs.count()} feeds")
-                for sub in subs:
-                    temp_file.write(f"{user.id},{sub.feed_id},1\n")
-            print(f"Page {page_num} of {paginator.num_pages} saved to {file_name}")
-            temp_file.flush()
-
-    @classmethod
-    def generate_user_subscription_documents(cls, file_name):
-        # Create a dictionary to hold each user's subscriptions
-        user_subscriptions = {}
-
-        with open(file_name, "r") as f:
-            for line in f:
-                user_id, feed_id, _ = line.strip().split(",")
-                if user_id not in user_subscriptions:
-                    user_subscriptions[user_id] = []
-                user_subscriptions[user_id].append(feed_id)
-
-        # Convert lists to space-separated strings
-        return [" ".join(feeds) for feeds in user_subscriptions.values()]
-
-    @classmethod
-    def recommend_feeds_for_user(cls, user_index, user_subscriptions, n=10):
-        # Convert user subscriptions to TF-IDF matrix
-        vectorizer = TfidfVectorizer()
-        tfidf_matrix = vectorizer.fit_transform(user_subscriptions)
-
-        # Compute cosine similarity between this user and all others
-        cosine_similarities = linear_kernel(tfidf_matrix[user_index], tfidf_matrix).flatten()
-
-        # Get top N similar users (excluding the user itself)
-        similar_users = cosine_similarities.argsort()[-n - 2 : -1][::-1]  # -2 to exclude the user themselves
-
-        # Gather feed IDs from similar users
-        recommended_feeds = set()
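-        # Each document in user_subscriptions is a space-separated string of feed IDs,
-        # so split() recovers each similar user's feed set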
-        for idx in similar_users:
-            recommended_feeds.update(set(user_subscriptions[idx].split()))
-
-        # Remove feeds the user is already subscribed to
-        current_user_feeds = set(user_subscriptions[user_index].split())
-        recommended_feeds = recommended_feeds - current_user_feeds
-
-        return list(recommended_feeds)
-
-    @classmethod
-    def recommend_feeds_for_feed_set(cls, feed_ids, user_subscriptions, n=10):
-        # Convert the list of feed IDs to a space-separated string (similar to the format in user_subscriptions)
-        user_profile = " ".join(feed_ids)
-
-        # Convert user subscriptions + the new user profile to TF-IDF matrix
-        vectorizer = TfidfVectorizer()
-        tfidf_matrix = vectorizer.fit_transform(user_subscriptions + [user_profile])
-
-        # Compute cosine similarity between this user profile and all others
-        cosine_similarities = linear_kernel(
-            tfidf_matrix[-1], tfidf_matrix[:-1]
-        ).flatten()  # last entry is our user profile
-
-        # Get top N similar users
-        similar_users = cosine_similarities.argsort()[-n:][::-1]
-
-        # Gather feed IDs from similar users
-        recommended_feeds = set()
-        for idx in similar_users:
-            recommended_feeds.update(set(user_subscriptions[idx].split()))
-
-        # Remove feeds that are in the user's current profile
-        recommended_feeds = recommended_feeds - set(feed_ids)
-
-        return list(recommended_feeds)
diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 61a139e19..b2c074b57 100755
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -35,9 +35,6 @@ from django.urls import reverse
 from django.utils.encoding import DjangoUnicodeDecodeError, smart_bytes, smart_str
 from mongoengine.errors import ValidationError
 from mongoengine.queryset import NotUniqueError, OperationError, Q
-from scipy.sparse import coo_matrix, csr_matrix
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.neighbors import NearestNeighbors
 
 from apps.rss_feeds.tasks import PushFeeds, ScheduleCountTagsForUser, UpdateFeeds
 from apps.rss_feeds.text_importer import TextImporter
@@ -1059,34 +1056,6 @@ class Feed(models.Model):
                 end=" ",
             )
 
-    def load_user_feed_similarity_model(self, csv_path=None, force=False):
-        if not csv_path:
-            csv_path = os.path.join(settings.DISCOVER_DATA_FOLDER, "user_feed_rating.csv")
-
-        if not os.path.exists(csv_path):
-            logging.debug(f" ---> ~FRDiscover CSV file not found: {csv_path}")
-            return
-
-        with open(csv_path, newline="") as csvfile:
-            data_reader = csv.reader(csvfile)
-            user_ids, feed_ids, ratings = [], [], []
-
-            for row in data_reader:
-                user_id, feed_id, rating = map(int, row)
-                user_ids.append(user_id)
-                feed_ids.append(feed_id)
-                ratings.append(np.log1p(float(rating)))
-
-            user_ids = np.array(user_ids)
-            feed_ids = np.array(feed_ids)
-            ratings = np.array(ratings)
-
-            # Create a sparse matrix
-            user_feed_matrix = coo_matrix((ratings, (user_ids, feed_ids))).tocsr()
-            logging.debug("Successfully loaded and transformed data")
-
-        return user_feed_matrix
-
     def count_similar_feeds(self, force=False, csv_path=None):
         if not force and self.similar_feeds.count():
             return self.similar_feeds.all()
@@ -1109,54 +1078,6 @@ class Feed(models.Model):
                 pass
 
         return self.similar_feeds.all()
-
-    def similarity_matrix_count_similar_feeds(self, force=False, csv_path=None):
-        if not force and self.similar_feeds.count():
-            return self.similar_feeds.all()
-
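-        # Load the user-by-feed rating matrix from the exported CSV (see load_user_feed_similarity_model)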
-        user_feed_matrix = self.load_user_feed_similarity_model(csv_path=csv_path, force=force)
-
-        def calculate_similarity(user_feed_matrix):
-            # Ensure the matrix is in CSR format for efficient row-wise operations
-            if not isinstance(user_feed_matrix, csr_matrix):
-                user_feed_matrix = csr_matrix(user_feed_matrix)
-
-            # Compute the cosine similarity matrix (result is dense, handle with care)
-            similarity_matrix = cosine_similarity(user_feed_matrix.T, dense_output=False)
-            return similarity_matrix
-
-        def recommend_feeds_knn(feed_id, user_feed_matrix, n_items=10):
-            model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=n_items, n_jobs=-1)
-
-            # Fit the model
-            model_knn.fit(user_feed_matrix.T)  # Transpose to get feed-wise neighbors
-
-            # Find neighbors for the specified feed
-            distances, indices = model_knn.kneighbors(user_feed_matrix.T[feed_id], n_neighbors=n_items + 1)
-
-            # Exclude the feed itself and return the indices of recommended feeds
-            recommended_feeds = [idx for idx in indices[0] if idx != feed_id]
-
-            return recommended_feeds[:n_items]
-
-        # Recommend for a specific feed ID
-        logging.debug(f"Generating recommendations for feed: {self}")
-        # Find nearest neighbors in feed-feed similarity space
-        similarity_matrix = calculate_similarity(user_feed_matrix)
-        top_recommended_feeds = recommend_feeds_knn(self.pk, similarity_matrix)
-        logging.debug(
-            f"Found {len(top_recommended_feeds)} recommendations for feed {self}: {top_recommended_feeds}"
-        )
-
-        self.similar_feeds.clear()
-        for feed_id in top_recommended_feeds:
-            try:
-                self.similar_feeds.add(feed_id)
-            except IntegrityError:
-                logging.debug(f" ---> ~FRIntegrity error adding similar feed: {feed_id}")
-                pass
-
-        return self.similar_feeds.all()
-
     def _split_favicon_color(self, color=None):
         if not color:
             color = self.favicon_color
diff --git a/config/requirements.txt b/config/requirements.txt
index f56901953..24526f153 100644
--- a/config/requirements.txt
+++ b/config/requirements.txt
@@ -101,6 +101,7 @@ redis>=4,<5
 requests>=2.25.0,<3
 requests-oauthlib==1.3.0
 scipy==1.12.0
+sentence_transformers==3.0.1
 sentry-sdk==1.44.1
 sgmllib3k==1.0.0
 simplejson==3.17.2
@@ -111,7 +112,6 @@ sqlparse==0.4.1
 stevedore==3.3.0
 stripe==2.55.1
 subdomains==3.0.1
-scikit-learn~=1.3.1
 text-unidecode==1.3
 tiktoken~=0.4.0
 toml==0.10.2
@@ -126,4 +126,3 @@ webencodings==0.5.1
 XlsxWriter==1.3.7
 zope.event==4.5.0
 zope.interface==5.4.0
-sentence_transformers==3.0.1