Split DiscoverStory out of SearchStory so that each can maintain its own Elasticsearch index.

This commit is contained in:
Samuel Clay 2024-11-20 08:37:31 -08:00
parent cf72874318
commit d3280ff95b
2 changed files with 144 additions and 11 deletions

View file

@ -38,7 +38,7 @@ from mongoengine.queryset import NotUniqueError, OperationError, Q
from apps.rss_feeds.tasks import PushFeeds, ScheduleCountTagsForUser, UpdateFeeds
from apps.rss_feeds.text_importer import TextImporter
from apps.search.models import SearchFeed, SearchStory
from apps.search.models import DiscoverStory, SearchFeed, SearchStory
from apps.statistics.rstats import RStats
from utils import feedfinder_forman, feedfinder_pilgrim
from utils import json_functions as json
@ -3067,6 +3067,7 @@ class MStory(mongo.Document):
def index_all_for_search(cls, offset=0):
if not offset:
SearchStory.create_elasticsearch_mapping(delete=True)
DiscoverStory.create_elasticsearch_mapping(delete=True)
last_pk = Feed.objects.latest("pk").pk
for f in range(offset, last_pk, 1000):
@ -3094,12 +3095,18 @@ class MStory(mongo.Document):
story_author=self.story_author_name,
story_feed_id=self.story_feed_id,
story_date=self.story_date,
story_content_vector=SearchStory.generate_story_content_vector(self.story_hash),
)
DiscoverStory.index(
story_hash=self.story_hash,
story_feed_id=self.story_feed_id,
story_date=self.story_date,
story_content_vector=DiscoverStory.generate_story_content_vector(self.story_hash),
)
def remove_from_search_index(self):
try:
SearchStory.remove(self.story_hash)
DiscoverStory.remove(self.story_hash)
except Exception:
pass
@ -3467,8 +3474,8 @@ class MStory(mongo.Document):
return original_page
def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
results = SearchStory.vector_query(
combined_content_vector = DiscoverStory.generate_combined_story_content_vector([self.story_hash])
results = DiscoverStory.vector_query(
combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit
)
logging.debug(

View file

@ -206,7 +206,7 @@ class MUserSearch(mongo.Document):
class SearchStory:
_es_client = None
name = "discover-stories-openai"
name = "stories"
@classmethod
def ES(cls):
@ -279,10 +279,6 @@ class SearchStory:
"store": False,
"type": "date",
},
"content_vector": {
"type": "dense_vector",
"dims": 1536, # Numbers of dims from text-embedding-3-small
},
}
cls.ES().indices.put_mapping(
body={
@ -302,7 +298,6 @@ class SearchStory:
story_author,
story_feed_id,
story_date,
story_content_vector,
):
cls.create_elasticsearch_mapping()
@ -313,7 +308,6 @@ class SearchStory:
"author": story_author,
"feed_id": story_feed_id,
"date": story_date,
"content_vector": story_content_vector,
}
try:
cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
@ -504,6 +498,138 @@ class SearchStory:
return result_ids
class DiscoverStory:
_es_client = None
name = "discover-stories-openai"
@classmethod
def ES(cls):
if cls._es_client is None:
cls._es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_STORY_HOST)
cls.create_elasticsearch_mapping()
return cls._es_client
@classmethod
def index_name(cls):
return "%s-index" % cls.name
@classmethod
def doc_type(cls):
    """Return the ES document type, or None when mapping types are disabled.

    Docker builds and the ``ES_IGNORE_TYPE`` setting (default True) both
    suppress doc types, matching modern Elasticsearch's typeless APIs.
    """
    types_enabled = not settings.DOCKERBUILD and not getattr(settings, "ES_IGNORE_TYPE", True)
    return "%s-type" % cls.name if types_enabled else None
@classmethod
def create_elasticsearch_mapping(cls, delete=False):
    """Ensure the discover-stories index and its mapping exist.

    When ``delete`` is True, the existing index is dropped first so the
    mapping can be rebuilt from scratch (used by the full reindex path).
    Connection failures are logged and swallowed so indexing callers keep
    working when the search server is unreachable.
    """
    if delete:
        logging.debug(" ---> ~FRDeleting search index for ~FM%s" % cls.index_name())
        try:
            cls.ES().indices.delete(cls.index_name())
        except elasticsearch.exceptions.NotFoundError:
            logging.debug(f" ---> ~FBCan't delete {cls.index_name()} index, doesn't exist...")

    # Fast path: index already present, nothing to (re)create.
    if cls.ES().indices.exists(cls.index_name()):
        return

    try:
        cls.ES().indices.create(cls.index_name())
        logging.debug(" ---> ~FCCreating search index for ~FM%s" % cls.index_name())
    except elasticsearch.exceptions.RequestError as e:
        logging.debug(" ***> ~FRCould not create search index for ~FM%s: %s" % (cls.index_name(), e))
        return
    except (
        elasticsearch.exceptions.ConnectionError,
        urllib3.exceptions.NewConnectionError,
        urllib3.exceptions.ConnectTimeoutError,
    ) as e:
        logging.debug(f" ***> ~FRNo search server available for creating story mapping: {e}")
        return

    # Text fields mirror SearchStory's mapping; content_vector adds the
    # dense embedding used for similarity ("discover") queries.
    mapping = {
        "title": {
            "store": False,
            "type": "text",
            "analyzer": "snowball",
            "term_vector": "yes",
        },
        "content": {
            "store": False,
            "type": "text",
            "analyzer": "snowball",
            "term_vector": "yes",
        },
        "tags": {
            "store": False,
            "type": "text",
            "fields": {"raw": {"type": "text", "analyzer": "keyword", "term_vector": "yes"}},
        },
        "author": {
            "store": False,
            "type": "text",
            "analyzer": "default",
        },
        "feed_id": {"store": False, "type": "integer"},
        "date": {
            "store": False,
            "type": "date",
        },
        "content_vector": {
            "type": "dense_vector",
            "dims": 1536,  # Number of dims emitted by text-embedding-3-small
        },
    }
    cls.ES().indices.put_mapping(
        body={
            "properties": mapping,
        },
        index=cls.index_name(),
    )
    # Flush so the new mapping is visible to immediately-following indexing.
    cls.ES().indices.flush(cls.index_name())
@classmethod
def index(
    cls,
    story_hash,
    story_feed_id,
    story_date,
    story_content_vector,
):
    """Index a single story's embedding document for discover queries.

    Conflicts (already-indexed stories) and connection failures are logged
    at debug level and otherwise ignored.
    """
    cls.create_elasticsearch_mapping()

    document = dict(
        feed_id=story_feed_id,
        date=story_date,
        content_vector=story_content_vector,
    )
    try:
        cls.ES().create(index=cls.index_name(), id=story_hash, body=document, doc_type=cls.doc_type())
    except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
        logging.debug(f" ***> ~FRNo search server available for discover story indexing: {e}")
    except elasticsearch.exceptions.ConflictError as e:
        logging.debug(f" ***> ~FBAlready indexed discover story: {e}")
@classmethod
def remove(cls, story_hash):
    """Delete a story's discover document from the index, if present.

    Falls back to the legacy "story-type" doc type when the document is
    not found under the current doc type. Connection failures are logged
    and swallowed so callers are not broken when search is down.
    """
    if not cls.ES().exists(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type()):
        return

    try:
        cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type())
    except elasticsearch.exceptions.NotFoundError:
        # Older documents may have been indexed under the legacy doc type.
        cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type="story-type")
    except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
        # BUG FIX: this clause previously caught NotFoundError a second
        # time, making it unreachable dead code. Its log message ("No
        # search server available") shows it was intended for connection
        # failures, matching the handler in index().
        logging.debug(f" ***> ~FRNo search server available for story deletion: {e}")
@classmethod
def drop(cls):
    """Delete the entire discover index; a missing index is not an error."""
    try:
        cls.ES().indices.delete(cls.index_name())
    except elasticsearch.exceptions.NotFoundError:
        # Nothing indexed yet — treat as a no-op rather than failing.
        logging.debug(" ***> ~FBNo index found, nothing to drop.")
@classmethod
def vector_query(
cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None