mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-08-31 21:41:33 +00:00
Adding API endpoints for discover stories. Need to finish the UI for stories. Then probably add premium limits.
This commit is contained in:
parent
a2d3785c7c
commit
d04b4b038a
4 changed files with 120 additions and 3 deletions
|
@ -2508,11 +2508,11 @@ class Feed(models.Model):
|
|||
|
||||
seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
|
||||
|
||||
similiar_length_min = 1000
|
||||
similar_length_min = 1000
|
||||
if existing_story.story_permalink == story_link and existing_story.story_title == story.get(
|
||||
"title"
|
||||
):
|
||||
similiar_length_min = 20
|
||||
similar_length_min = 20
|
||||
|
||||
# Skip content check if already failed due to a timeout. This way we catch titles
|
||||
if lightweight:
|
||||
|
@ -2521,7 +2521,7 @@ class Feed(models.Model):
|
|||
if (
|
||||
seq
|
||||
and story_content
|
||||
and len(story_content) > similiar_length_min
|
||||
and len(story_content) > similar_length_min
|
||||
and existing_story_content
|
||||
and seq.real_quick_ratio() > 0.9
|
||||
and seq.quick_ratio() > 0.95
|
||||
|
@ -3466,6 +3466,17 @@ class MStory(mongo.Document):
|
|||
|
||||
return original_page
|
||||
|
||||
def fetch_similar_stories(self, feed_ids=None, offset=0, limit=5):
    """Return stories semantically similar to this one via vector search.

    Builds a combined content vector for this story's hash, then runs a
    similarity query against the search backend, optionally restricted to
    ``feed_ids``. Returns the raw search hits.
    """
    story_vector = SearchStory.generate_combined_story_content_vector([self.story_hash])
    similar = SearchStory.vector_query(
        story_vector,
        feed_ids_to_include=feed_ids,
        offset=offset,
        max_results=limit,
    )
    titles = [hit["_source"]["title"] for hit in similar]
    logging.debug(f"Found {len(similar)} recommendations for stories related to {self}: {titles}")

    return similar
|
||||
|
||||
|
||||
class MStarredStory(mongo.DynamicDocument):
|
||||
"""Like MStory, but not inherited due to large overhead of _cls and _type in
|
||||
|
|
|
@ -28,4 +28,5 @@ urlpatterns = [
|
|||
url(r"^story_changes", views.story_changes, name="story-changes"),
|
||||
url(r"^discover/(?P<feed_id>\d+)/?$", views.discover_feeds, name="discover-feed"),
|
||||
url(r"^discover/feeds/?$", views.discover_feeds, name="discover-feeds"),
|
||||
url(r"^discover/stories/(?P<story_hash>.+)/?$", views.discover_stories, name="discover-stories"),
|
||||
]
|
||||
|
|
|
@ -676,3 +676,21 @@ def discover_feeds(request, feed_id=None):
|
|||
|
||||
logging.user(request, "~FCDiscovering similar feeds, page %s: ~SB%s" % (page, similar_feed_ids))
|
||||
return {"discover_feeds": discover_feeds}
|
||||
|
||||
|
||||
@ajax_login_required
@json.json_view
def discover_stories(request, story_hash):
    """Ajax endpoint returning stories similar to ``story_hash``.

    Reads optional ``page`` and ``feed_ids`` parameters from either GET or
    POST. Responds with formatted stories, or an error payload when the
    source story cannot be found.
    """
    per_page = 5
    page_number = int(request.GET.get("page") or request.POST.get("page") or 1)
    feed_ids = request.GET.getlist("feed_ids") or request.POST.getlist("feed_ids")
    skip = (page_number - 1) * per_page

    story, _ = MStory.find_story(story_hash=story_hash)
    if not story:
        return {"code": -1, "message": "Story not found.", "discover_stories": None, "failed": True}

    similar_hits = story.fetch_similar_stories(feed_ids=feed_ids, offset=skip, limit=per_page)
    similar_hashes = [hit["_id"] for hit in similar_hits]
    matched_stories = MStory.objects.filter(story_hash__in=similar_hashes)
    formatted = Feed.format_stories(matched_stories)
    return {"discover_stories": formatted}
|
||||
|
|
|
@ -504,6 +504,93 @@ class SearchStory:
|
|||
|
||||
return result_ids
|
||||
|
||||
@classmethod
def vector_query(
    cls, query_vector, offset=0, max_results=10, feed_ids_to_include=None, feed_ids_to_exclude=None
):
    """Run a cosine-similarity search against story content vectors.

    Optionally restricts hits to ``feed_ids_to_include`` and filters out
    ``feed_ids_to_exclude``. Returns the raw ES hit dicts, or an empty
    list when the search backend is unavailable.
    """
    try:
        cls.ES().indices.flush(index=cls.index_name())
    except elasticsearch.exceptions.NotFoundError as e:
        logging.debug(f" ***> ~FRNo search server available: {e}")
        return []

    # Score every document by cosine similarity to the query vector; the
    # +1.0 keeps scores non-negative, as ES script_score requires.
    scoring_clause = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }
    bool_query = {"must": [scoring_clause]}
    if feed_ids_to_include:
        bool_query["must"].append({"terms": {"feed_id": feed_ids_to_include}})
    if feed_ids_to_exclude:
        bool_query["must_not"] = [{"terms": {"feed_id": feed_ids_to_exclude}}]

    body = {
        "query": {
            "bool": bool_query,
        },
        "size": max_results,
        "from": offset,
    }
    try:
        results = cls.ES().search(body=body, index=cls.index_name(), doc_type=cls.doc_type())
    except elasticsearch.exceptions.RequestError as e:
        logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
        return []

    logging.info(
        f"~FGVector search ~FCstories~FG: ~SB{max_results}~SN requested{f'~SB offset {offset}~SN' if offset else ''}, ~SB{len(results['hits']['hits'])}~SN results"
    )

    return results["hits"]["hits"]
|
||||
|
||||
@classmethod
def fetch_story_content_vector(cls, story_hash):
    """Look up the stored content vector for ``story_hash`` in ES.

    Returns [] when the search backend is unreachable or the story has no
    indexed vector.
    """
    try:
        cls.ES().indices.flush(index=cls.index_name())
    except elasticsearch.exceptions.NotFoundError as e:
        logging.debug(f" ***> ~FRNo search server available: {e}")
        return []

    query_body = {"query": {"ids": {"values": [story_hash]}}}
    try:
        response = cls.ES().search(body=query_body, index=cls.index_name(), doc_type=cls.doc_type())
    except elasticsearch.exceptions.RequestError as e:
        logging.debug(" ***> ~FRNo search server available for querying: %s" % e)
        return []

    hits = response["hits"]["hits"]
    if not hits:
        logging.debug(f" ---> ~FRNo content vector found for story {story_hash}")
        return []
    return hits[0]["_source"]["content_vector"]
|
||||
|
||||
@classmethod
def generate_combined_story_content_vector(cls, story_hashes):
    """Average the content vectors of ``story_hashes`` into one unit vector.

    Each story's vector is fetched from ES, falling back to generating it
    on the fly when it has not been indexed yet. The mean vector is
    L2-normalized so downstream cosine-similarity scores are comparable.

    Returns an empty list when ``story_hashes`` is empty, matching the
    failure convention of fetch_story_content_vector().
    """
    vectors = []
    for story_hash in story_hashes:
        vector = cls.fetch_story_content_vector(story_hash)
        if not vector:
            vector = cls.generate_story_content_vector(story_hash)
        vectors.append(vector)

    # Guard: np.mean([]) warns and yields NaN; bail out explicitly.
    if not vectors:
        return []

    combined_vector = np.mean(vectors, axis=0)
    # Guard: a zero norm (all-zero vectors) would turn every component
    # into NaN via division by zero; leave the zero vector unnormalized.
    norm = np.linalg.norm(combined_vector)
    if norm:
        combined_vector = combined_vector / norm

    return combined_vector
|
||||
|
||||
@classmethod
|
||||
def generate_story_content_vector(cls, story_hash):
|
||||
from apps.rss_feeds.models import MStory
|
||||
|
|
Loading…
Add table
Reference in a new issue