mirror of https://github.com/samuelclay/NewsBlur.git
synced 2025-04-13 09:42:01 +00:00
Removing unused scikit-learn code.
This commit is contained in:
parent c31893f669
commit 711a6fbbfc
4 changed files with 2 additions and 383 deletions

.vscode/settings.json (vendored): 2 changes
@@ -8,7 +8,7 @@
     ],
     "editor.formatOnSave": true,
     "editor.codeActionsOnSave": {
-        "source.organizeImports": true
+        "source.organizeImports": "explicit"
     },
     "python.linting.enabled": true,
     "python.linting.pylintEnabled": false,
apps/recommendations/models.py

@@ -7,16 +7,11 @@ import mongoengine as mongo
 from django.contrib.auth.models import User
 from django.core.paginator import Paginator
 from django.db import models
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

 from apps.reader.models import UserSubscription, UserSubscriptionFolders
 from apps.rss_feeds.models import Feed
 from utils import json_functions as json

-# from surprise import NMF, SVD, Dataset, KNNBasic, KNNWithMeans, Reader, accuracy
-# from surprise.model_selection import cross_validate, train_test_split
-

 class RecommendedFeed(models.Model):
     feed = models.ForeignKey(Feed, related_name="recommendations", on_delete=models.CASCADE)
@@ -83,299 +78,3 @@ class MFeedFolder(mongo.Document):
                    sub_folder_title = cls.feed_folder_parent(f_v, feed_id, f_k)
                    if sub_folder_title:
                        return sub_folder_title

class CollaborativelyFilteredRecommendation(models.Model):
    @classmethod
    def store_user_feed_data_to_file(cls, file_name, force=False, skip=None):
        if not skip:
            skip = 0
        if os.path.exists(file_name) and not force and skip == 0:
            print(f"{file_name} exists, skipping storing data...")
            return

        temp_file = open(file_name, "a+")
        max_user_pk = User.objects.latest("pk").pk
        for user_id in range(skip, max_user_pk + 1):  # include the latest pk
            try:
                user = User.objects.get(pk=user_id)
            except User.DoesNotExist:
                continue
            # Only include active feeds with at least 5 subscribers and a story in the last month
            last_month = datetime.datetime.now() - datetime.timedelta(days=30)
            subs = UserSubscription.objects.filter(
                user=user,
                feed__num_subscribers__gte=5,
                feed__stories_last_month__gte=1,
                feed__active_subscribers__gte=1,
                feed__last_story_date__gte=last_month,
            )
            for sub in subs:
                well_read_score = sub.feed.well_read_score(user_id=sub.user_id)["reach_score"]
                if not well_read_score:
                    continue
                temp_file.write(f"{user.id},{sub.feed_id},{well_read_score}\n")
            temp_file.flush()
            if user_id % 1000 == 0:
                print(f"User {user_id} saved to {file_name}")
        temp_file.close()
    @classmethod
    def svd(cls, trainset, testset):
        model = SVD()
        model.fit(trainset)
        predictions = model.test(testset)
        accuracy.rmse(predictions)

        return model

    @classmethod
    def nmf(cls, trainset):
        model = NMF()
        model.fit(trainset)
        # cross_validate(model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
        return model
    @classmethod
    def get_predicted_ratings(cls, model, user_id, all_feed_ids):
        predicted_ratings = {}
        for feed_id in all_feed_ids:
            pred = model.predict(user_id, feed_id)
            predicted_ratings[feed_id] = pred.est
        return predicted_ratings

    @classmethod
    def get_recommendations(cls, user_id, feed_ids, model, n=10):
        # Predict ratings for every candidate feed
        predictions = [model.predict(str(user_id), str(feed_id), verbose=False) for feed_id in feed_ids]

        # Sort by highest predicted rating, treating a bottom-of-scale estimate as no signal
        sorted_predictions = sorted(predictions, key=lambda x: x.est if x.est != 1 else 0, reverse=True)

        # Return the top n (feed id, estimated rating) pairs
        return [(pred.iid, pred.est) for pred in sorted_predictions[:n]]
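
The sorting above leans on surprise's Prediction objects, where .est carries the model's estimated rating. A minimal, self-contained sketch of that API; the tiny DataFrame below is a hypothetical example, not NewsBlur data:

import pandas as pd
from surprise import SVD, Dataset, Reader

# Hypothetical toy ratings in the same user,item,rating shape as the CSV above
df = pd.DataFrame(
    {
        "user": ["1", "1", "2", "2", "3"],
        "item": ["10", "11", "10", "12", "11"],
        "rating": [80, 20, 60, 90, 40],
    }
)
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

model = SVD(n_factors=2)
model.fit(data.build_full_trainset())

# predict() takes raw ids and returns a Prediction whose .est is the score
pred = model.predict("1", "12")
print(pred.est)
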
    @classmethod
    def load_knn_model(cls, file_name):
        """OOM: runs out of memory on the full dataset."""
        print(f"Loading user item rating from {file_name}")
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 100))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()
        print(f"Training set has {trainset.n_users} users and {trainset.n_items} items")

        # Using KNNBasic to compute item-item similarities
        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
        model.fit(trainset)

        return trainset, model

    @classmethod
    def load_knnbasic_model(cls, file_name):
        """OOM: runs out of memory on the full dataset."""
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()

        # Report counts from the trainset; the raw Dataset has no n_users/n_items
        print(f"Training model with {trainset.n_users} users and {trainset.n_items} items")

        # Configure KNNBasic for item-item similarities
        model = KNNBasic(sim_options={"name": "cosine", "user_based": False})
        model.fit(trainset)

        return trainset, model
    @classmethod
    def get_feed_similarities(cls, trainset, model, feed_id, n=10):
        """OOM: runs out of memory on the full dataset."""
        # Retrieve the inner id of the feed
        feed_inner_id = trainset.to_inner_iid(str(feed_id))

        # Get the top N most similar feeds
        neighbors = model.get_neighbors(feed_inner_id, k=n)
        similar_feeds = [trainset.to_raw_iid(inner_id) for inner_id in neighbors]

        return similar_feeds
    @classmethod
    def recommend_similar_feeds_for_folder(cls, trainset, model, folder_feeds, n=10):
        all_similar_feeds = defaultdict(float)

        for feed_id in folder_feeds:
            similar_feeds = cls.get_feed_similarities(trainset, model, feed_id, n)
            for sf in similar_feeds:
                all_similar_feeds[sf] += 1  # Count occurrences for ranking

        # Sort feeds by occurrence and take the top N not already in the folder
        sorted_feeds = sorted(all_similar_feeds, key=all_similar_feeds.get, reverse=True)
        recommendations = [feed for feed in sorted_feeds if feed not in folder_feeds][:n]

        return recommendations
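
Driving the removed kNN pipeline end to end would look roughly like this; the CSV path and feed ID are hypothetical placeholders:

# Sketch only: the CSV path and feed ID below are hypothetical placeholders
csv_path = "user_feed_rating.csv"  # rows of user,feed,rating
trainset, model = CollaborativelyFilteredRecommendation.load_knnbasic_model(csv_path)

# Ten feeds whose subscriber overlap is closest to feed 42
similar = CollaborativelyFilteredRecommendation.get_feed_similarities(trainset, model, 42, n=10)
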
    @classmethod
    def load_svd_model(cls, file_name):
        reader = Reader(line_format="user item rating", sep=",", rating_scale=(0, 1))
        data = Dataset.load_from_file(file_name, reader)
        trainset = data.build_full_trainset()

        print("Training SVD model")
        model = SVD()
        model.fit(trainset)
        print("SVD model trained")

        return trainset, model
    @classmethod
    def get_item_similarities(cls, model):
        """OOM: runs out of memory on the full dataset."""
        # Retrieve item factor vectors (embeddings)
        item_factors = model.qi

        # Compute cosine similarity between item embeddings
        item_similarities = cosine_similarity(item_factors)

        return item_similarities
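
The same neighbor lookup can be done directly on the factor matrix. A small sketch with a random stand-in for model.qi (surprise's item-factor matrix after fit):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Stand-in for model.qi: 100 items embedded in 20 latent factors
item_factors = np.random.rand(100, 20)

sims = cosine_similarity(item_factors)  # dense (100, 100) similarity matrix
np.fill_diagonal(sims, -1.0)  # drop each item's self-similarity
top_10 = np.argsort(sims, axis=1)[:, -10:][:, ::-1]  # 10 nearest inner ids per item
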
    # @classmethod
    # def build_faiss_index(cls, model):
    #     # Retrieve item factor vectors (embeddings)
    #     item_factors = model.qi.astype("float32")  # Faiss requires float32
    #
    #     # Build the Faiss index
    #     index = faiss.IndexFlatL2(item_factors.shape[1])
    #     index.add(item_factors)
    #
    #     return index

    # @classmethod
    # def build_faiss_ivfpq_index(cls, model, nlists=100):
    #     item_factors = model.qi.astype("float32")
    #     dim = item_factors.shape[1]
    #
    #     # Choose an M that divides dim evenly; adjust to the actual dimension
    #     M = 4 if dim % 4 == 0 else 8 if dim % 8 == 0 else 1
    #
    #     # Quantizer and index
    #     quantizer = faiss.IndexFlatL2(dim)
    #     index = faiss.IndexIVFPQ(quantizer, dim, nlists, M, 8)
    #     index.train(item_factors)
    #     index.add(item_factors)
    #
    #     return index
    @classmethod
    def recommend_similar_feeds_for_item_nnmf(cls, model, trainset, user_id, feed_ids, n=10):
        users_who_liked_feeds = set()

        # Collect users who interacted with these feeds
        # (trainset.all_ratings() yields (user, item, rating) triples)
        for uid, _, _ in trainset.all_ratings():
            for feed_id in feed_ids:
                if model.predict(uid, feed_id).est > 0.5:  # Assuming > 0.5 implies interaction/like
                    users_who_liked_feeds.add(uid)

        # Predict feeds for these users across every feed in the training set
        all_feed_ids = [trainset.to_raw_iid(inner_id) for inner_id in trainset.all_items()]
        all_recommendations = {}
        print(f"Number of users who liked the feeds: {len(users_who_liked_feeds)}")
        for uid in users_who_liked_feeds:
            user_recs = cls.get_recommendations(uid, all_feed_ids, model, n)
            print(f"Recommendations for user {uid}: {user_recs}")
            for rec, _est in user_recs:
                if rec not in feed_ids:  # Exclude the original feeds
                    all_recommendations[rec] = all_recommendations.get(rec, 0) + 1

        # Sort feeds by how many times they appear as recommendations
        sorted_recommendations = sorted(all_recommendations, key=all_recommendations.get, reverse=True)

        return sorted_recommendations[:n]


class SubscriptionBasedRecommendation:
    @classmethod
    def store_user_feed_data_to_file(cls, file_name):
        if os.path.exists(file_name):
            print(f"{file_name} exists, skipping storing data...")
            return

        temp_file = open(file_name, "w+")
        users = User.objects.all().order_by("pk")
        paginator = Paginator(users, 1000)
        for page_num in paginator.page_range:
            users = paginator.page(page_num)
            for user in users:
                # Only include feeds with num_subscribers >= 5
                subs = UserSubscription.objects.filter(user=user, feed__num_subscribers__gte=5)
                # print(f"User {user} has {subs.count()} feeds")
                for sub in subs:
                    temp_file.write(f"{user.id},{sub.feed_id},1\n")
            print(f"Page {page_num} of {paginator.num_pages} saved to {file_name}")
            temp_file.flush()
        temp_file.close()
    @classmethod
    def generate_user_subscription_documents(cls, file_name):
        # Create a dictionary to hold each user's subscriptions
        user_subscriptions = {}

        with open(file_name, "r") as f:
            for line in f:
                user_id, feed_id, _ = line.strip().split(",")
                if user_id not in user_subscriptions:
                    user_subscriptions[user_id] = []
                user_subscriptions[user_id].append(feed_id)

        # Convert lists to space-separated strings, one "document" per user
        return [" ".join(feeds) for feeds in user_subscriptions.values()]
    @classmethod
    def recommend_feeds_for_user(cls, user_index, user_subscriptions, n=10):
        # Convert user subscriptions to a TF-IDF matrix
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(user_subscriptions)

        # Compute cosine similarity between this user and all others
        cosine_similarities = linear_kernel(tfidf_matrix[user_index], tfidf_matrix).flatten()

        # Get the top N similar users; the final index is the user's own perfect match, so drop it
        similar_users = cosine_similarities.argsort()[-n - 2 : -1][::-1]

        # Gather feed IDs from similar users
        recommended_feeds = set()
        for idx in similar_users:
            recommended_feeds.update(set(user_subscriptions[idx].split()))

        # Remove feeds the user is already subscribed to
        current_user_feeds = set(user_subscriptions[user_index].split())
        recommended_feeds = recommended_feeds - current_user_feeds

        return list(recommended_feeds)
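
A note on linear_kernel here: TfidfVectorizer L2-normalizes rows by default, so the plain dot product already equals cosine similarity and skips a redundant re-normalization. A toy check with three hypothetical users:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Each "document" is one user's subscriptions as space-separated feed IDs
docs = ["101 202 303", "202 303 404", "505 606"]
tfidf = TfidfVectorizer().fit_transform(docs)

# Rows are unit-length, so the dot product and cosine similarity agree
assert np.allclose(linear_kernel(tfidf, tfidf), cosine_similarity(tfidf))
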
    @classmethod
    def recommend_feeds_for_feed_set(cls, feed_ids, user_subscriptions, n=10):
        # Convert the list of feed IDs to a space-separated string (the same format as user_subscriptions)
        user_profile = " ".join(feed_ids)

        # Convert user subscriptions + the new user profile to a TF-IDF matrix
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(user_subscriptions + [user_profile])

        # Compute cosine similarity between this user profile and all others
        cosine_similarities = linear_kernel(
            tfidf_matrix[-1], tfidf_matrix[:-1]
        ).flatten()  # the last entry is our user profile
        threshold = 0.9  # Adjust based on your data and requirements
        strongly_similar_users = [idx for idx, sim in enumerate(cosine_similarities) if sim >= threshold]
        # Note: strongly_similar_users is computed but never used below

        # Get the top N similar users
        similar_users = cosine_similarities.argsort()[-n:][::-1]

        # Gather feed IDs from similar users
        recommended_feeds = set()
        for idx in similar_users:
            recommended_feeds.update(set(user_subscriptions[idx].split()))

        # Remove feeds that are in the user's current profile
        recommended_feeds = recommended_feeds - set(feed_ids)

        return list(recommended_feeds)

apps/rss_feeds/models.py

@@ -35,9 +35,6 @@ from django.urls import reverse
 from django.utils.encoding import DjangoUnicodeDecodeError, smart_bytes, smart_str
 from mongoengine.errors import ValidationError
 from mongoengine.queryset import NotUniqueError, OperationError, Q
-from scipy.sparse import coo_matrix, csr_matrix
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.neighbors import NearestNeighbors

 from apps.rss_feeds.tasks import PushFeeds, ScheduleCountTagsForUser, UpdateFeeds
 from apps.rss_feeds.text_importer import TextImporter
@@ -1059,34 +1056,6 @@ class Feed(models.Model):
                end=" ",
            )

    def load_user_feed_similarity_model(self, csv_path=None, force=False):
        if not csv_path:
            csv_path = os.path.join(settings.DISCOVER_DATA_FOLDER, "user_feed_rating.csv")

        if not os.path.exists(csv_path):
            logging.debug(f" ---> ~FRDiscover CSV file not found: {csv_path}")
            return

        with open(csv_path, newline="") as csvfile:
            data_reader = csv.reader(csvfile)
            user_ids, feed_ids, ratings = [], [], []

            for row in data_reader:
                user_id, feed_id, rating = map(int, row)
                user_ids.append(user_id)
                feed_ids.append(feed_id)
                ratings.append(np.log1p(float(rating)))

        user_ids = np.array(user_ids)
        feed_ids = np.array(feed_ids)
        ratings = np.array(ratings)

        # Create a sparse matrix
        user_feed_matrix = coo_matrix((ratings, (user_ids, feed_ids))).tocsr()
        logging.debug("Successfully loaded and transformed data")

        return user_feed_matrix
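
The coo_matrix call above places ratings[k] at row user_ids[k], column feed_ids[k], and sizes the matrix by the largest index seen. A toy illustration:

import numpy as np
from scipy.sparse import coo_matrix

user_ids = np.array([0, 0, 1, 2])
feed_ids = np.array([10, 11, 10, 12])
ratings = np.log1p(np.array([5.0, 2.0, 7.0, 1.0]))  # same log damping as above

user_feed_matrix = coo_matrix((ratings, (user_ids, feed_ids))).tocsr()
print(user_feed_matrix.shape)  # (3, 13): sized by the largest indices seen
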
    def count_similar_feeds(self, force=False, csv_path=None):
        if not force and self.similar_feeds.count():
            return self.similar_feeds.all()
@@ -1109,54 +1078,6 @@
                pass
        return self.similar_feeds.all()

    def similarity_matrix_count_similar_feeds(self, force=False, csv_path=None):
        if not force and self.similar_feeds.count():
            return self.similar_feeds.all()

        user_feed_matrix = self.load_user_feed_similarity_model(csv_path=csv_path, force=force)

        def calculate_similarity(user_feed_matrix):
            # Ensure the matrix is in CSR format for efficient row-wise operations
            if not isinstance(user_feed_matrix, csr_matrix):
                user_feed_matrix = csr_matrix(user_feed_matrix)

            # Compute the cosine similarity matrix (the result can be large; handle with care)
            similarity_matrix = cosine_similarity(user_feed_matrix.T, dense_output=False)
            return similarity_matrix

        def recommend_feeds_knn(feed_id, user_feed_matrix, n_items=10):
            model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=n_items, n_jobs=-1)

            # Fit the model
            model_knn.fit(user_feed_matrix.T)  # Transpose to get feed-wise neighbors

            # Find neighbors for the specified feed
            distances, indices = model_knn.kneighbors(user_feed_matrix.T[feed_id], n_neighbors=n_items + 1)

            # Exclude the feed itself and return the indices of recommended feeds
            recommended_feeds = [idx for idx in indices[0] if idx != feed_id]

            return recommended_feeds[:n_items]

        # Recommend for a specific feed ID
        logging.debug(f"Generating recommendations for feed: {self}")
        similarity_matrix = calculate_similarity(user_feed_matrix)
        top_recommended_feeds = recommend_feeds_knn(self.pk, similarity_matrix)
        logging.debug(
            f"Found {len(top_recommended_feeds)} recommendations for feed {self}: {top_recommended_feeds}"
        )

        self.similar_feeds.clear()
        for feed_id in top_recommended_feeds:
            try:
                self.similar_feeds.add(feed_id)
            except IntegrityError:
                logging.debug(f" ---> ~FRIntegrity error adding similar feed: {feed_id}")
        return self.similar_feeds.all()
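
The inner recommend_feeds_knn is a standard item-kNN over the transposed user-feed matrix. A deterministic toy run; the 4x5 interaction matrix is hypothetical:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Hypothetical 4 users x 5 feeds interaction matrix
user_feed = csr_matrix(
    np.array(
        [
            [1, 1, 0, 0, 1],
            [1, 0, 1, 0, 0],
            [0, 1, 0, 1, 1],
            [1, 1, 1, 0, 1],
        ],
        dtype=float,
    )
)

feeds = user_feed.T.tocsr()  # transpose so rows are feeds
knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=3)
knn.fit(feeds)

# Nearest feeds to feed 2 (the first hit is feed 2 itself at distance 0)
distances, indices = knn.kneighbors(feeds[2], n_neighbors=3)
print(indices[0])
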
    def _split_favicon_color(self, color=None):
        if not color:
            color = self.favicon_color

requirements.txt

@@ -101,6 +101,7 @@ redis>=4,<5
 requests>=2.25.0,<3
 requests-oauthlib==1.3.0
 scipy==1.12.0
+sentence_transformers==3.0.1
 sentry-sdk==1.44.1
 sgmllib3k==1.0.0
 simplejson==3.17.2
@@ -111,7 +112,6 @@ sqlparse==0.4.1
 stevedore==3.3.0
 stripe==2.55.1
 subdomains==3.0.1
-scikit-learn~=1.3.1
 text-unidecode==1.3
 tiktoken~=0.4.0
 toml==0.10.2
@@ -126,4 +126,3 @@ webencodings==0.5.1
 XlsxWriter==1.3.7
 zope.event==4.5.0
 zope.interface==5.4.0
-sentence_transformers==3.0.1