Adding API endpoints for discover stories. Need to finish the UI for stories, then probably add premium limits.

Samuel Clay 2024-11-06 21:29:07 -08:00
parent a2d3785c7c
commit d04b4b038a
4 changed files with 120 additions and 3 deletions

View file

@@ -2508,11 +2508,11 @@ class Feed(models.Model):
        seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
-       similiar_length_min = 1000
+       similar_length_min = 1000
        if existing_story.story_permalink == story_link and existing_story.story_title == story.get(
            "title"
        ):
-           similiar_length_min = 20
+           similar_length_min = 20

        # Skip content check if already failed due to a timeout. This way we catch titles
        if lightweight:
@@ -2521,7 +2521,7 @@ class Feed(models.Model):
        if (
            seq
            and story_content
-           and len(story_content) > similiar_length_min
+           and len(story_content) > similar_length_min
            and existing_story_content
            and seq.real_quick_ratio() > 0.9
            and seq.quick_ratio() > 0.95
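For context on the staged checks above: difflib's real_quick_ratio() is a cheap upper bound computed from string lengths alone, and quick_ratio() a tighter bound computed from character counts, so both act as pre-filters before the expensive full comparison. A toy illustration (strings are made up):

    # Toy illustration of difflib's staged similarity bounds.
    import difflib

    a = "The quick brown fox jumps over the lazy dog"
    b = "The quick brown fox jumped over the lazy dog"
    seq = difflib.SequenceMatcher(None, a, b)
    print(seq.real_quick_ratio())  # cheapest upper bound, from lengths only
    print(seq.quick_ratio())       # tighter upper bound, from character multisets
    print(seq.ratio())             # full, expensive similarity score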
@@ -3466,6 +3466,17 @@ class MStory(mongo.Document):
        return original_page

    def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
        combined_content_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
        results = SearchStory.vector_query(
            combined_content_vector, feed_ids_to_include=feed_ids, offset=offset, max_results=limit
        )
        logging.debug(
            f"Found {len(results)} recommendations for stories related to {self}: {[r['_source']['title'] for r in results]}"
        )
        return results


class MStarredStory(mongo.DynamicDocument):
    """Like MStory, but not inherited due to large overhead of _cls and _type in

View file

@@ -28,4 +28,5 @@ urlpatterns = [
    url(r"^story_changes", views.story_changes, name="story-changes"),
    url(r"^discover/(?P<feed_id>\d+)/?$", views.discover_feeds, name="discover-feed"),
    url(r"^discover/feeds/?$", views.discover_feeds, name="discover-feeds"),
+   url(r"^discover/stories/(?P<story_hash>.+)/?$", views.discover_stories, name="discover-stories"),
]
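A hedged smoke test for the new route using Django's test client; the URL prefix, user object, and story hash are all assumptions:

    # Assumes this urls.py is mounted at the site root and that a user object exists;
    # @ajax_login_required rejects anonymous requests.
    from django.test import Client

    client = Client()
    client.force_login(user)  # hypothetical authenticated user
    response = client.get("/discover/stories/42:deadbeef", {"page": 1, "feed_ids": [42, 43]})
    print(response.json().get("discover_stories"))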

View file

@@ -676,3 +676,21 @@ def discover_feeds(request, feed_id=None):
    logging.user(request, "~FCDiscovering similar feeds, page %s: ~SB%s" % (page, similar_feed_ids))

    return {"discover_feeds": discover_feeds}


@ajax_login_required
@json.json_view
def discover_stories(request, story_hash):
    page = int(request.GET.get("page") or request.POST.get("page") or 1)
    feed_ids = request.GET.getlist("feed_ids") or request.POST.getlist("feed_ids")
    limit = 5
    offset = (page - 1) * limit

    story, _ = MStory.find_story(story_hash=story_hash)
    if not story:
        return {"code": -1, "message": "Story not found.", "discover_stories": None, "failed": True}

    similar_stories = story.fetch_similar_stories(feed_ids=feed_ids, offset=offset, limit=limit)
    similar_story_hashes = [result["_id"] for result in similar_stories]
    stories = MStory.objects.filter(story_hash__in=similar_story_hashes)
    stories = Feed.format_stories(stories)

    return {"discover_stories": stories}

View file

@@ -504,6 +504,93 @@ class SearchStory:
        return result_ids

    @classmethod
    def vector_query(
        cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None
    ):
        try:
            cls.ES().indices.flush(index=cls.index_name())
        except elasticsearch.exceptions.NotFoundError as e:
            logging.debug(f" ***> ~FRNo search server available: {e}")
            return []

        must_clauses = [
            {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }
            }
        ]
        must_not_clauses = []
        if feed_ids_to_include:
            must_clauses.append({"terms": {"feed_id": feed_ids_to_include}})
        if feed_ids_to_exclude:
            must_not_clauses.append({"terms": {"feed_id": feed_ids_to_exclude}})

        clauses = {}
        if must_clauses:
            clauses["must"] = must_clauses
        if must_not_clauses:
            clauses["must_not"] = must_not_clauses

        body = {
            "query": {
                "bool": clauses,
            },
            "size": max_results,
            "from": offset,
        }

        try:
            results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type())
        except elasticsearch.exceptions.RequestError as e:
            logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
            return []

        logging.info(
            f"~FGVector search ~FCstories~FG: ~SB{max_results}~SN requested{f'~SB offset {offset}~SN' if offset else ''}, ~SB{len(results['hits']['hits'])}~SN results"
        )
        return results["hits"]["hits"]
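A sketch of how the pieces compose: pull a stored vector for one story (via fetch_story_content_vector, defined just below) and query for its neighbors. The story hash and feed ids are made up:

    # Hedged sketch composing fetch_story_content_vector + vector_query.
    vector = SearchStory.fetch_story_content_vector("42:deadbeef")
    if vector:
        hits = SearchStory.vector_query(
            vector,
            offset=0,
            max_results=5,
            feed_ids_to_include=[42, 43],
        )
        # Each hit carries a cosine-similarity-derived _score plus the indexed _source.
        for hit in hits:
            print(hit["_score"], hit["_source"]["title"])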
    @classmethod
    def fetch_story_content_vector(cls, story_hash):
        # Fetch the content vector from ES for the specified story_hash
        try:
            cls.ES().indices.flush(index=cls.index_name())
        except elasticsearch.exceptions.NotFoundError as e:
            logging.debug(f" ***> ~FRNo search server available: {e}")
            return []

        body = {"query": {"ids": {"values": [story_hash]}}}
        try:
            results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type())
        except elasticsearch.exceptions.RequestError as e:
            logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
            return []

        # logging.debug(f"Results: {results}")
        if len(results["hits"]["hits"]) == 0:
            logging.debug(f" ---> ~FRNo content vector found for story {story_hash}")
            return []

        return results["hits"]["hits"][0]["_source"]["content_vector"]
    @classmethod
    def generate_combined_story_content_vector(cls, story_hashes):
        vectors = []
        for story_hash in story_hashes:
            vector = cls.fetch_story_content_vector(story_hash)
            if not vector:
                vector = cls.generate_story_content_vector(story_hash)
            vectors.append(vector)

        combined_vector = np.mean(vectors, axis=0)
        normalized_combined_vector = combined_vector / np.linalg.norm(combined_vector)
        return normalized_combined_vector
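Averaging unit vectors does not itself yield a unit vector, hence the explicit renormalization above; a toy check with 2-d vectors:

    # Toy check: mean of unit vectors, then renormalize to unit length.
    import numpy as np

    vectors = [np.array([1.0, 0.0]), np.array([0.6, 0.8])]
    combined = np.mean(vectors, axis=0)                # [0.8, 0.4], norm < 1
    normalized = combined / np.linalg.norm(combined)
    print(np.linalg.norm(normalized))                  # -> 1.0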
    @classmethod
    def generate_story_content_vector(cls, story_hash):
        from apps.rss_feeds.models import MStory