Splitting DiscoverStory from SearchStory so the indexes can be separate.
parent cf72874318
commit d3280ff95b
2 changed files with 144 additions and 11 deletions
apps/rss_feeds/models.py

@@ -38,7 +38,7 @@ from mongoengine.queryset import NotUniqueError, OperationError, Q
 from apps.rss_feeds.tasks import PushFeeds, ScheduleCountTagsForUser, UpdateFeeds
 from apps.rss_feeds.text_importer import TextImporter
-from apps.search.models import SearchFeed, SearchStory
+from apps.search.models import DiscoverStory, SearchFeed, SearchStory
 from apps.statistics.rstats import RStats
 from utils import feedfinder_forman, feedfinder_pilgrim
 from utils import json_functions as json
@@ -3067,6 +3067,7 @@ class MStory(mongo.Document):
     def index_all_for_search(cls, offset=0):
         if not offset:
             SearchStory.create_elasticsearch_mapping(delete=True)
+            DiscoverStory.create_elasticsearch_mapping(delete=True)

         last_pk = Feed.objects.latest("pk").pk
         for f in range(offset, last_pk, 1000):
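The loop body is cut off by this hunk, so here is a hedged sketch of how a rebuild pass over feed pks in blocks of 1,000 plausibly proceeds; the query shape and the per-story indexer name (index_story_for_search) are assumptions, not shown in the diff:

# Hedged sketch of the rebuild loop body, which the hunk truncates.
# Feed (Django ORM) and MStory (mongoengine) come from apps.rss_feeds.models;
# index_story_for_search is an assumed name for the per-story indexer.
for f in range(offset, last_pk, 1000):
    feed_ids = Feed.objects.filter(pk__gte=f, pk__lt=f + 1000).values_list("pk", flat=True)
    for story in MStory.objects.filter(story_feed_id__in=list(feed_ids)):
        story.index_story_for_search()  # writes to both indexes, per the next hunk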
@@ -3094,12 +3095,18 @@ class MStory(mongo.Document):
             story_author=self.story_author_name,
             story_feed_id=self.story_feed_id,
             story_date=self.story_date,
-            story_content_vector=SearchStory.generate_story_content_vector(self.story_hash),
         )
+        DiscoverStory.index(
+            story_hash=self.story_hash,
+            story_feed_id=self.story_feed_id,
+            story_date=self.story_date,
+            story_content_vector=DiscoverStory.generate_story_content_vector(self.story_hash),
+        )

     def remove_from_search_index(self):
         try:
             SearchStory.remove(self.story_hash)
+            DiscoverStory.remove(self.story_hash)
         except Exception:
             pass
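The net effect of this hunk: indexing one story now writes two Elasticsearch documents, a full-text document for search and a slim vector document for discovery. A usage sketch, assuming the enclosing method is named index_story_for_search (the hunk shows only its body) and with a made-up story hash:

# Each story yields two ES documents after this change:
#   stories-index                   -> title/content/tags/author full-text fields
#   discover-stories-openai-index   -> feed_id, date, 1536-dim content_vector
story = MStory.objects.get(story_hash="42:d3280f")  # hypothetical hash
story.index_story_for_search()  # assumed method name for the body shown above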
@@ -3467,8 +3474,8 @@ class MStory(mongo.Document):
         return original_page

     def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
-        combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
-        results = SearchStory.vector_query(
+        combined_content_vector = DiscoverStory.generate_combined_story_content_vector([self.story_hash])
+        results = DiscoverStory.vector_query(
             combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit
         )
         logging.debug(
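fetch_similar_stories now builds its query vector via DiscoverStory.generate_combined_story_content_vector, whose implementation is not part of this diff. One plausible reading of "combined" is an element-wise mean of the per-story embeddings, re-normalized for cosine similarity; a sketch under that assumption only:

import numpy as np

def combine_story_vectors(vectors):
    # Assumed semantics: average the 1536-dim embeddings, then re-normalize
    # to unit length so cosine-similarity scores stay comparable.
    mean = np.asarray(vectors, dtype=float).mean(axis=0)
    norm = np.linalg.norm(mean)
    return (mean / norm).tolist() if norm else mean.tolist()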
apps/search/models.py

@@ -206,7 +206,7 @@ class MUserSearch(mongo.Document):

 class SearchStory:
     _es_client = None
-    name = "discover-stories-openai"
+    name = "stories"

     @classmethod
     def ES(cls):
@@ -279,10 +279,6 @@ class SearchStory:
                 "store": False,
                 "type": "date",
             },
-            "content_vector": {
-                "type": "dense_vector",
-                "dims": 1536,  # Number of dims from text-embedding-3-small
-            },
         }
         cls.ES().indices.put_mapping(
             body={
@@ -302,7 +298,6 @@ class SearchStory:
         story_author,
         story_feed_id,
         story_date,
-        story_content_vector,
     ):
         cls.create_elasticsearch_mapping()

@@ -313,7 +308,6 @@ class SearchStory:
             "author": story_author,
             "feed_id": story_feed_id,
             "date": story_date,
-            "content_vector": story_content_vector,
         }
         try:
             cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
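With content_vector gone from its mapping, its signature, and its document body, SearchStory.index is text-only again. A hedged sketch of the resulting call site; only story_author, story_feed_id, and story_date are visible in these hunks, so the other parameter names are inferred, not confirmed:

# Assumed full call after the removal; parameter names above story_author
# are inferred from the surrounding code, not shown in the diff.
SearchStory.index(
    story_hash=story.story_hash,
    story_title=story.story_title,
    story_content=story.story_content,
    story_tags=story.story_tags,
    story_author=story.story_author_name,
    story_feed_id=story.story_feed_id,
    story_date=story.story_date,
)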
@@ -504,6 +498,138 @@ class SearchStory:

         return result_ids

+
+class DiscoverStory:
+    _es_client = None
+    name = "discover-stories-openai"
+
+    @classmethod
+    def ES(cls):
+        if cls._es_client is None:
+            cls._es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_STORY_HOST)
+            cls.create_elasticsearch_mapping()
+        return cls._es_client
+
+    @classmethod
+    def index_name(cls):
+        return "%s-index" % cls.name
+
+    @classmethod
+    def doc_type(cls):
+        if settings.DOCKERBUILD or getattr(settings, "ES_IGNORE_TYPE", True):
+            return None
+        return "%s-type" % cls.name
+
+    @classmethod
+    def create_elasticsearch_mapping(cls, delete=False):
+        if delete:
+            logging.debug(" ---> ~FRDeleting search index for ~FM%s" % cls.index_name())
+            try:
+                cls.ES().indices.delete(cls.index_name())
+            except elasticsearch.exceptions.NotFoundError:
+                logging.debug(f" ---> ~FBCan't delete {cls.index_name()} index, doesn't exist...")
+
+        if cls.ES().indices.exists(cls.index_name()):
+            return
+
+        try:
+            cls.ES().indices.create(cls.index_name())
+            logging.debug(" ---> ~FCCreating search index for ~FM%s" % cls.index_name())
+        except elasticsearch.exceptions.RequestError as e:
+            logging.debug(" ***> ~FRCould not create search index for ~FM%s: %s" % (cls.index_name(), e))
+            return
+        except (
+            elasticsearch.exceptions.ConnectionError,
+            urllib3.exceptions.NewConnectionError,
+            urllib3.exceptions.ConnectTimeoutError,
+        ) as e:
+            logging.debug(f" ***> ~FRNo search server available for creating story mapping: {e}")
+            return
+
+        mapping = {
+            "title": {
+                "store": False,
+                "type": "text",
+                "analyzer": "snowball",
+                "term_vector": "yes",
+            },
+            "content": {
+                "store": False,
+                "type": "text",
+                "analyzer": "snowball",
+                "term_vector": "yes",
+            },
+            "tags": {
+                "store": False,
+                "type": "text",
+                "fields": {"raw": {"type": "text", "analyzer": "keyword", "term_vector": "yes"}},
+            },
+            "author": {
+                "store": False,
+                "type": "text",
+                "analyzer": "default",
+            },
+            "feed_id": {"store": False, "type": "integer"},
+            "date": {
+                "store": False,
+                "type": "date",
+            },
+            "content_vector": {
+                "type": "dense_vector",
+                "dims": 1536,  # Number of dims from text-embedding-3-small
+            },
+        }
+        cls.ES().indices.put_mapping(
+            body={
+                "properties": mapping,
+            },
+            index=cls.index_name(),
+        )
+        cls.ES().indices.flush(cls.index_name())
+
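The mapping reserves a 1536-dim dense_vector, matching the output size of OpenAI's text-embedding-3-small. generate_story_content_vector itself is not shown in this diff; a hedged sketch of how such a vector might be produced, where the openai client usage is an assumption rather than NewsBlur's confirmed code:

# Hypothetical sketch: turn a story's text into a 1536-dim embedding.
# The model name matches the mapping comment; everything else is assumed.
from openai import OpenAI

def generate_story_content_vector_sketch(story_title, story_content):
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    text = f"{story_title}\n\n{story_content}"[:8000]  # stay under token limits
    response = client.embeddings.create(model="text-embedding-3-small", input=text)
    return response.data[0].embedding  # list[float] of length 1536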
+    @classmethod
+    def index(
+        cls,
+        story_hash,
+        story_feed_id,
+        story_date,
+        story_content_vector,
+    ):
+        cls.create_elasticsearch_mapping()
+
+        doc = {
+            "feed_id": story_feed_id,
+            "date": story_date,
+            "content_vector": story_content_vector,
+        }
+        try:
+            cls.ES().create(index=cls.index_name(), id=story_hash, body=doc, doc_type=cls.doc_type())
+        except (elasticsearch.exceptions.ConnectionError, urllib3.exceptions.NewConnectionError) as e:
+            logging.debug(f" ***> ~FRNo search server available for discover story indexing: {e}")
+        except elasticsearch.exceptions.ConflictError as e:
+            logging.debug(f" ***> ~FBAlready indexed discover story: {e}")
+        # if settings.DEBUG:
+        #     logging.debug(f" ***> ~FBIndexed {story_hash}")
+
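A short usage sketch of the new index method; generate_story_content_vector appears in the MStory hunk above, and the literal values here are made up:

import datetime

# Illustrative values; story_hash follows NewsBlur's "<feed_id>:<hash>" shape.
DiscoverStory.index(
    story_hash="42:d3280f",
    story_feed_id=42,
    story_date=datetime.datetime.now(),
    story_content_vector=DiscoverStory.generate_story_content_vector("42:d3280f"),
)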
+    @classmethod
+    def remove(cls, story_hash):
+        if not cls.ES().exists(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type()):
+            return
+
+        try:
+            cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type=cls.doc_type())
+        except elasticsearch.exceptions.NotFoundError:
+            cls.ES().delete(index=cls.index_name(), id=story_hash, doc_type="story-type")
+        except elasticsearch.exceptions.ConnectionError as e:
+            logging.debug(f" ***> ~FRNo search server available for story deletion: {e}")
+
+    @classmethod
+    def drop(cls):
+        try:
+            cls.ES().indices.delete(cls.index_name())
+        except elasticsearch.exceptions.NotFoundError:
+            logging.debug(" ***> ~FBNo index found, nothing to drop.")
+
+    @classmethod
+    def vector_query(
+        cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None
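The diff view truncates here, before vector_query's body. For orientation, a hedged sketch of one standard way to score documents by cosine similarity against a dense_vector field (a script_score query); NewsBlur's actual implementation may differ:

# Hypothetical sketch only: the diff cuts off before vector_query's body.
def vector_query_sketch(es, index_name, query_vector, offset=0, max_results=10, feed_ids_to_include=None):
    filters = []
    if feed_ids_to_include:
        filters.append({"terms": {"feed_id": feed_ids_to_include}})
    body = {
        "query": {
            "script_score": {
                "query": {"bool": {"filter": filters}} if filters else {"match_all": {}},
                "script": {
                    # +1.0 keeps scores non-negative, as Elasticsearch requires
                    "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            }
        },
        "from": offset,
        "size": max_results,
    }
    return es.search(index=index_name, body=body)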