Split DiscoverStory out of SearchStory so that each can maintain its own Elasticsearch index.

This commit is contained in:
Samuel Clay 2024-11-20 08:37:31 -08:00
parent cf72874318
commit d3280ff95b
2 changed files with 144 additions and 11 deletions

View file

@ -38,7 +38,7 @@ from mongoengine.queryset import NotUniqueError, OperationError, Q
from apps.rss_feeds.tasks import PushFeeds, ScheduleCountTagsForUser, UpdateFeeds
from apps.rss_feeds.text_importer import TextImporter
from apps.search.models import SearchFeed, SearchStory
from apps.search.models import DiscoverStory, SearchFeed, SearchStory
from apps.statistics.rstats import RStats
from utils import feedfinder_forman, feedfinder_pilgrim
from utils import json_functions as json
@ -3067,6 +3067,7 @@ class MStory(mongo.Document):
def index_all_for_search(cls, offset=0):
if not offset:
SearchStory.create_elasticsearch_mapping(delete=True)
DiscoverStory.create_elasticsearch_mapping(delete=True)
last_pk = Feed.objects.latest("pk").pk
for f in range(offset, last_pk, 1000):
@ -3094,12 +3095,18 @@ class MStory(mongo.Document):
story_author=self.story_author_name,
story_feed_id=self.story_feed_id,
story_date=self.story_date,
story_content_vector=SearchStory.generate_story_content_vector(self.story_hash),
)
DiscoverStory.index(
story_hash=self.story_hash,
story_feed_id=self.story_feed_id,
story_date=self.story_date,
story_content_vector=DiscoverStory.generate_story_content_vector(self.story_hash),
)
def remove_from_search_index(self):
try:
SearchStory.remove(self.story_hash)
DiscoverStory.remove(self.story_hash)
except Exception:
pass
@ -3467,8 +3474,8 @@ class MStory(mongo.Document):
return original_page
def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
results = SearchStory.vector_query(
combined_content_vector = DiscoverStory.generate_combined_story_content_vector([self.story_hash])
results = DiscoverStory.vector_query(
combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit
)
logging.debug(

View file

@ -206,7 +206,7 @@ class MUserSearch(mongo.Document):
class SearchStory:
_es_client = None
name = "discover-stories-openai"
name = "stories"
@classmethod
def ES(cls):
@ -279,10 +279,6 @@ class SearchStory:
"store": False,
"type": "date",
},
"content_vector": {
"type": "dense_vector",
"dims": 1536, # Numbers of dims from text-embedding-3-small
},
}
cls.ES().indices.put_mapping(
body={
@ -302,7 +298,6 @@ class SearchStory:
story_author,
story_feed_id,
story_date,
story_content_vector,
):
cls.create_elasticsearch_mapping()
@ -313,7 +308,6 @@ class SearchStory:
"author": story_author,
"feed_id": story_feed_id,
"date": story_date,
"content_vector": story_content_vector,
}
try:
cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
@ -504,6 +498,138 @@ class SearchStory:
return result_ids
class DiscoverStory:
_es_client = None
name = "discover-stories-openai"
@classmethod
def ES(cls):
if cls._es_client is None:
cls._es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_STORY_HOST)
cls.create_elasticsearch_mapping()
return cls._es_client
@classmethod
def index_name(cls):
return "%s-index" % cls.name
@classmethod
def doc_type(cls):
    """Return the ES document type, or None when mapping types are disabled.

    Docker builds and the ``ES_IGNORE_TYPE`` setting (default True) both
    suppress doc types, matching modern Elasticsearch's typeless APIs.
    """
    types_enabled = not settings.DOCKERBUILD and not getattr(settings, "ES_IGNORE_TYPE", True)
    return "%s-type" % cls.name if types_enabled else None
@classmethod
def create_elasticsearch_mapping(cls, delete=False):
    """Ensure the discover-stories index and its mapping exist.

    When ``delete`` is True, the existing index is dropped first so the
    mapping can be rebuilt from scratch (used by the full reindex path).
    Connection failures are logged and swallowed so indexing callers keep
    working when the search server is unreachable.
    """
    if delete:
        logging.debug(" ---> ~FRDeleting search index for ~FM%s" % cls.index_name())
        try:
            cls.ES().indices.delete(cls.index_name())
        except elasticsearch.exceptions.NotFoundError:
            logging.debug(f" ---> ~FBCan't delete {cls.index_name()} index, doesn't exist...")

    # Fast path: index already present, nothing to (re)create.
    if cls.ES().indices.exists(cls.index_name()):
        return

    try:
        cls.ES().indices.create(cls.index_name())
        logging.debug(" ---> ~FCCreating search index for ~FM%s" % cls.index_name())
    except elasticsearch.exceptions.RequestError as e:
        logging.debug(" ***> ~FRCould not create search index for ~FM%s: %s" % (cls.index_name(), e))
        return
    except (
        elasticsearch.exceptions.ConnectionError,
        urllib3.exceptions.NewConnectionError,
        urllib3.exceptions.ConnectTimeoutError,
    ) as e:
        logging.debug(f" ***> ~FRNo search server available for creating story mapping: {e}")
        return

    # Text fields mirror SearchStory's mapping; content_vector adds the
    # dense embedding used for similarity ("discover") queries.
    mapping = {
        "title": {
            "store": False,
            "type": "text",
            "analyzer": "snowball",
            "term_vector": "yes",
        },
        "content": {
            "store": False,
            "type": "text",
            "analyzer": "snowball",
            "term_vector": "yes",
        },
        "tags": {
            "store": False,
            "type": "text",
            "fields": {"raw": {"type": "text", "analyzer": "keyword", "term_vector": "yes"}},
        },
        "author": {
            "store": False,
            "type": "text",
            "analyzer": "default",
        },
        "feed_id": {"store": False, "type": "integer"},
        "date": {
            "store": False,
            "type": "date",
        },
        "content_vector": {
            "type": "dense_vector",
            "dims": 1536,  # Number of dims emitted by text-embedding-3-small
        },
    }
    cls.ES().indices.put_mapping(
        body={
            "properties": mapping,
        },
        index=cls.index_name(),
    )
    # Flush so the new mapping is visible to immediately-following indexing.
    cls.ES().indices.flush(cls.index_name())
@classmethod
def index(
    cls,
    story_hash,
    story_feed_id,
    story_date,
    story_content_vector,
):
    """Index a single story's embedding document for discover queries.

    Conflicts (already-indexed stories) and connection failures are logged
    at debug level and otherwise ignored.
    """
    cls.create_elasticsearch_mapping()

    document = dict(
        feed_id=story_feed_id,
        date=story_date,
        content_vector=story_content_vector,
    )
    try:
        cls.ES().create(index=cls.index_name(), id=story_hash, body=document, doc_type=cls.doc_type())
    except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
        logging.debug(f" ***> ~FRNo search server available for discover story indexing: {e}")
    except elasticsearch.exceptions.ConflictError as e:
        logging.debug(f" ***> ~FBAlready indexed discover story: {e}")
@classmethod
def remove(cls, story_hash):
    """Delete a story's discover document from the index, if present.

    Falls back to the legacy "story-type" doc type when the document is
    not found under the current doc type. Connection failures are logged
    and swallowed so callers are not broken when search is down.
    """
    if not cls.ES().exists(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type()):
        return

    try:
        cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type())
    except elasticsearch.exceptions.NotFoundError:
        # Older documents may have been indexed under the legacy doc type.
        cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type="story-type")
    except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
        # BUG FIX: this clause previously caught NotFoundError a second
        # time, making it unreachable dead code. Its log message ("No
        # search server available") shows it was intended for connection
        # failures, matching the handler in index().
        logging.debug(f" ***> ~FRNo search server available for story deletion: {e}")
@classmethod
def drop(cls):
    """Delete the entire discover index; a missing index is not an error."""
    try:
        cls.ES().indices.delete(cls.index_name())
    except elasticsearch.exceptions.NotFoundError:
        # Nothing indexed yet — treat as a no-op rather than failing.
        logging.debug(" ***> ~FBNo index found, nothing to drop.")
@classmethod
def vector_query(
cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None