From d04b4b038a44fe147d4832c37c249fe04ab2658d Mon Sep 17 00:00:00 2001 From: Samuel Clay Date: Wed, 6 Nov 2024 21:29:07 -0800 Subject: [PATCH] Adding API endpoints for discover stories. Need to finish the UI for stories. Then prob add premium limits. --- apps/rss_feeds/models.py | 17 ++++++-- apps/rss_feeds/urls.py | 1 + apps/rss_feeds/views.py | 18 +++++++++ apps/search/models.py | 87 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 3 deletions(-) diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index f7c3c4b1d..05afaa966 100755 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -2508,11 +2508,11 @@ class Feed(models.Model): seq = difflib.SequenceMatcher(None, story_content, existing_story_content) - similiar_length_min = 1000 + similar_length_min = 1000 if existing_story.story_permalink == story_link and existing_story.story_title == story.get( "title" ): - similiar_length_min = 20 + similar_length_min = 20 # Skip content check if already failed due to a timeout. 
This way we catch titles if lightweight: @@ -2521,7 +2521,7 @@ class Feed(models.Model): if ( seq and story_content - and len(story_content) > similiar_length_min + and len(story_content) > similar_length_min and existing_story_content and seq.real_quick_ratio() > 0.9 and seq.quick_ratio() > 0.95 @@ -3466,6 +3466,17 @@ class MStory(mongo.Document): return original_page + def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5): + combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash]) + results = SearchStory.vector_query( + combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit + ) + logging.debug( + f"Found {len(results)} recommendations for stories related to {self}: {[r['_source']['title'] for r in results]}" + ) + + return results + class MStarredStory(mongo.DynamicDocument): """Like MStory, but not inherited due to large overhead of _cls and _type in diff --git a/apps/rss_feeds/urls.py b/apps/rss_feeds/urls.py index 41f934385..a2f5e9e9e 100644 --- a/apps/rss_feeds/urls.py +++ b/apps/rss_feeds/urls.py @@ -28,4 +28,5 @@ urlpatterns = [ url(r"^story_changes", views.story_changes, name="story-changes"), url(r"^discover/(?P<feed_id>\d+)/?$", views.discover_feeds, name="discover-feed"), url(r"^discover/feeds/?$", views.discover_feeds, name="discover-feeds"), + url(r"^discover/stories/(?P<story_hash>.+)/?$", views.discover_stories, name="discover-stories"), ] diff --git a/apps/rss_feeds/views.py b/apps/rss_feeds/views.py index 16cae5858..caf05fb2c 100644 --- a/apps/rss_feeds/views.py +++ b/apps/rss_feeds/views.py @@ -676,3 +676,21 @@ def discover_feeds(request, feed_id=None): logging.user(request, "~FCDiscovering similar feeds, page %s: ~SB%s" % (page, similar_feed_ids)) return {"discover_feeds": discover_feeds} + + +@ajax_login_required +@json.json_view +def discover_stories(request, story_hash): + page = int(request.GET.get("page") or request.POST.get("page") or 1) + feed_ids = 
request.GET.getlist("feed_ids") or request.POST.getlist("feed_ids") + limit = 5 + offset = (page - 1) * limit + story, _ = MStory.find_story(story_hash=story_hash) + if not story: + return {"code": -1, "message": "Story not found.", "discover_stories": None, "failed": True} + + similar_stories = story.fetch_similar_stories(feed_ids=feed_ids, offset=offset, limit=limit) + similar_story_hashes = [result["_id"] for result in similar_stories] + stories = MStory.objects.filter(story_hash__in=similar_story_hashes) + stories = Feed.format_stories(stories) + return {"discover_stories": stories} diff --git a/apps/search/models.py b/apps/search/models.py index 691ba1f83..65ce73bb0 100644 --- a/apps/search/models.py +++ b/apps/search/models.py @@ -504,6 +504,93 @@ class SearchStory: return result_ids + @classmethod + def vector_query( + cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None + ): + try: + cls.ES().indices.flush(index=cls.index_name()) + except elasticsearch.exceptions.NotFoundError as e: + logging.debug(f" ***> ~FRNo search server available: {e}") + return [] + + must_clauses = [ + { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0", + "params": {"query_vector": query_vector}, + }, + } + } + ] + must_not_clauses = [] + if feed_ids_to_include: + must_clauses.append({"terms": {"feed_id": feed_ids_to_include}}) + if feed_ids_to_exclude: + must_not_clauses.append({"terms": {"feed_id": feed_ids_to_exclude}}) + + clauses = {} + if must_clauses: + clauses["must"] = must_clauses + if must_not_clauses: + clauses["must_not"] = must_not_clauses + + body = { + "query": { + "bool": clauses, + }, + "size": max_results, + "from": offset, + } + try: + results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type()) + except elasticsearch.exceptions.RequestError as e: + logging.debug(" ***> ~FRNo search server available for 
querying: %s" % e) + return [] + + logging.info( + f"~FGVector search ~FCstories~FG: ~SB{max_results}~SN requested{f'~SB offset {offset}~SN' if offset else ''}, ~SB{len(results['hits']['hits'])}~SN results" + ) + + return results["hits"]["hits"] + + @classmethod + def fetch_story_content_vector(cls, story_hash): + # Fetch the content vector from ES for the specified story_hash + try: + cls.ES().indices.flush(index=cls.index_name()) + except elasticsearch.exceptions.NotFoundError as e: + logging.debug(f" ***> ~FRNo search server available: {e}") + return [] + + body = {"query": {"ids": {"values": [story_hash]}}} + try: + results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type()) + except elasticsearch.exceptions.RequestError as e: + logging.debug(" ***> ~FRNo search server available for querying: %s" % e) + return [] + # logging.debug(f"Results: {results}") + if len(results["hits"]["hits"]) == 0: + logging.debug(f" ---> ~FRNo content vector found for story {story_hash}") + return [] + return results["hits"]["hits"][0]["_source"]["content_vector"] + + @classmethod + def generate_combined_story_content_vector(cls, story_hashes): + vectors = [] + for story_hash in story_hashes: + vector = cls.fetch_story_content_vector(story_hash) + if not vector: + vector = cls.generate_story_content_vector(story_hash) + vectors.append(vector) + + combined_vector = np.mean(vectors, axis=0) + normalized_combined_vector = combined_vector / np.linalg.norm(combined_vector) + + return normalized_combined_vector + @classmethod def generate_story_content_vector(cls, story_hash): from apps.rss_feeds.models import MStory