Adding API endpoints for discover stories. Need to finish the UI for stories, then probably add premium limits.

Samuel Clay 2024-11-06 21:29:07 -08:00
parent a2d3785c7c
commit d04b4b038a
4 changed files with 120 additions and 3 deletions

View file

@@ -2508,11 +2508,11 @@ class Feed(models.Model):
        seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
-       similiar_length_min = 1000
+       similar_length_min = 1000
        if existing_story.story_permalink == story_link and existing_story.story_title == story.get(
            "title"
        ):
-           similiar_length_min = 20
+           similar_length_min = 20

        # Skip content check if already failed due to a timeout. This way we catch titles
        if lightweight:
@@ -2521,7 +2521,7 @@ class Feed(models.Model):
        if (
            seq
            and story_content
-           and len(story_content) > similiar_length_min
+           and len(story_content) > similar_length_min
            and existing_story_content
            and seq.real_quick_ratio() > 0.9
            and seq.quick_ratio() > 0.95
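For context on the staged checks above: difflib's real_quick_ratio() is a cheap upper bound computed from string lengths alone, and quick_ratio() a tighter bound computed from character counts, so both act as pre-filters before the expensive full comparison. A toy illustration (strings are made up):

    # Toy illustration of difflib's staged similarity bounds.
    import difflib

    a = "The quick brown fox jumps over the lazy dog"
    b = "The quick brown fox jumped over the lazy dog"
    seq = difflib.SequenceMatcher(None, a, b)
    print(seq.real_quick_ratio())  # cheapest upper bound, from lengths only
    print(seq.quick_ratio())       # tighter upper bound, from character multisets
    print(seq.ratio())             # full, expensive similarity score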
@@ -3466,6 +3466,17 @@ class MStory(mongo.Document):
        return original_page

    def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
        combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
        results = SearchStory.vector_query(
            combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit
        )
        logging.debug(
            f"Found {len(results)} recommendations for stories related to {self}: {[r['_source']['title'] for r in results]}"
        )
        return results


class MStarredStory(mongo.DynamicDocument):
    """Like MStory, but not inherited due to large overhead of _cls and _type in

View file

@@ -28,4 +28,5 @@ urlpatterns = [
    url(r"^story_changes", views.story_changes, name="story-changes"),
    url(r"^discover/(?P<feed_id>\d+)/?$", views.discover_feeds, name="discover-feed"),
    url(r"^discover/feeds/?$", views.discover_feeds, name="discover-feeds"),
+   url(r"^discover/stories/(?P<story_hash>.+)/?$", views.discover_stories, name="discover-stories"),
]
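A hedged smoke test for the new route using Django's test client; the URL prefix, user object, and story hash are all assumptions:

    # Assumes this urls.py is mounted at the site root and that a user object exists;
    # @ajax_login_required rejects anonymous requests.
    from django.test import Client

    client = Client()
    client.force_login(user)  # hypothetical authenticated user
    response = client.get("/discover/stories/42:deadbeef", {"page": 1, "feed_ids": [42, 43]})
    print(response.json().get("discover_stories"))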

View file

@@ -676,3 +676,21 @@ def discover_feeds(request, feed_id=None):
    logging.user(request, "~FCDiscovering similar feeds, page %s: ~SB%s" % (page, similar_feed_ids))

    return {"discover_feeds": discover_feeds}


@ajax_login_required
@json.json_view
def discover_stories(request, story_hash):
    page = int(request.GET.get("page") or request.POST.get("page") or 1)
    feed_ids = request.GET.getlist("feed_ids") or request.POST.getlist("feed_ids")
    limit = 5
    offset = (page - 1) * limit

    story, _ = MStory.find_story(story_hash=story_hash)
    if not story:
        return {"code": -1, "message": "Story not found.", "discover_stories": None, "failed": True}

    similar_stories = story.fetch_similar_stories(feed_ids=feed_ids, offset=offset, limit=limit)
    similar_story_hashes = [result["_id"] for result in similar_stories]
    stories = MStory.objects.filter(story_hash__in=similar_story_hashes)
    stories = Feed.format_stories(stories)

    return {"discover_stories": stories}

View file

@@ -504,6 +504,93 @@ class SearchStory:
        return result_ids

    @classmethod
    def vector_query(
        cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None
    ):
        try:
            cls.ES().indices.flush(index=cls.index_name())
        except elasticsearch.exceptions.NotFoundError as e:
            logging.debug(f" ***> ~FRNo search server available: {e}")
            return []

        must_clauses = [
            {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }
            }
        ]
        must_not_clauses = []
        if feed_ids_to_include:
            must_clauses.append({"terms": {"feed_id": feed_ids_to_include}})
        if feed_ids_to_exclude:
            must_not_clauses.append({"terms": {"feed_id": feed_ids_to_exclude}})

        clauses = {}
        if must_clauses:
            clauses["must"] = must_clauses
        if must_not_clauses:
            clauses["must_not"] = must_not_clauses

        body = {
            "query": {
                "bool": clauses,
            },
            "size": max_results,
            "from": offset,
        }

        try:
            results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type())
        except elasticsearch.exceptions.RequestError as e:
            logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
            return []

        logging.info(
            f"~FGVector search ~FCstories~FG: ~SB{max_results}~SN requested{f'~SB offset {offset}~SN' if offset else ''}, ~SB{len(results['hits']['hits'])}~SN results"
        )
        return results["hits"]["hits"]
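A sketch of how the pieces compose: pull a stored vector for one story (via fetch_story_content_vector, defined just below) and query for its neighbors. The story hash and feed ids are made up:

    # Hedged sketch composing fetch_story_content_vector + vector_query.
    vector = SearchStory.fetch_story_content_vector("42:deadbeef")
    if vector:
        hits = SearchStory.vector_query(
            vector,
            offset=0,
            max_results=5,
            feed_ids_to_include=[42, 43],
        )
        # Each hit carries a cosine-similarity-derived _score plus the indexed _source.
        for hit in hits:
            print(hit["_score"], hit["_source"]["title"])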
    @classmethod
    def fetch_story_content_vector(cls, story_hash):
        # Fetch the content vector from ES for the specified story_hash
        try:
            cls.ES().indices.flush(index=cls.index_name())
        except elasticsearch.exceptions.NotFoundError as e:
            logging.debug(f" ***> ~FRNo search server available: {e}")
            return []

        body = {"query": {"ids": {"values": [story_hash]}}}
        try:
            results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type())
        except elasticsearch.exceptions.RequestError as e:
            logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
            return []

        # logging.debug(f"Results: {results}")
        if len(results["hits"]["hits"]) == 0:
            logging.debug(f" ---> ~FRNo content vector found for story {story_hash}")
            return []

        return results["hits"]["hits"][0]["_source"]["content_vector"]
    @classmethod
    def generate_combined_story_content_vector(cls, story_hashes):
        vectors = []
        for story_hash in story_hashes:
            vector = cls.fetch_story_content_vector(story_hash)
            if not vector:
                vector = cls.generate_story_content_vector(story_hash)
            vectors.append(vector)

        combined_vector = np.mean(vectors, axis=0)
        normalized_combined_vector = combined_vector / np.linalg.norm(combined_vector)
        return normalized_combined_vector
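Averaging unit vectors does not itself yield a unit vector, hence the explicit renormalization above; a toy check with 2-d vectors:

    # Toy check: mean of unit vectors, then renormalize to unit length.
    import numpy as np

    vectors = [np.array([1.0, 0.0]), np.array([0.6, 0.8])]
    combined = np.mean(vectors, axis=0)                # [0.8, 0.4], norm < 1
    normalized = combined / np.linalg.norm(combined)
    print(np.linalg.norm(normalized))                  # -> 1.0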
    @classmethod
    def generate_story_content_vector(cls, story_hash):
        from apps.rss_feeds.models import MStory