Rewriting search to use Elasticsearch across feeds. Much faster, just needs indexing.

Samuel Clay 2014-04-15 14:17:15 -07:00
parent d19bf1b641
commit 3b81c374d4
5 changed files with 59 additions and 50 deletions
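The "just needs indexing" part of the commit message refers to backfilling the new Elasticsearch index once this code is deployed. A minimal, hedged sketch of that one-off step from a Django shell, assuming the models live in apps.rss_feeds.models as in NewsBlur's usual layout:

# Hedged sketch: rebuild the story search index after deploying this commit.
# The module path is an assumption; index_all_for_search comes from the diff below.
from apps.rss_feeds.models import MStory

# With offset=0 the Elasticsearch mapping is recreated, then stories are
# indexed feed by feed up to the latest feed pk.
MStory.index_all_for_search()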

View file

@@ -40,6 +40,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from utils.feed_functions import relative_timesince
 from utils.feed_functions import seconds_timesince
 from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
+from utils.story_functions import prep_for_search
 
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -1179,25 +1180,34 @@ class Feed(models.Model):
     @classmethod
     def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
+        story_ids = SearchStory.query(feed_ids=feed_ids, query=query)
         stories_db = MStory.objects(
-            Q(story_feed_id__in=feed_ids) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
         ).order_by('-story_date')[offset:offset+limit]
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id__in=feed_ids) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
         stories = cls.format_stories(stories_db)
         
         return stories
     
     def find_stories(self, query, offset=0, limit=25):
-        SearchStory.query(feed_ids=[self.pk], query=query)
+        story_ids = SearchStory.query(feed_ids=[self.pk], query=query)
         stories_db = MStory.objects(
-            Q(story_feed_id=self.pk) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
        ).order_by('-story_date')[offset:offset+limit]
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id=self.pk) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
         stories = self.format_stories(stories_db, self.pk)
         
         return stories
@@ -1722,6 +1732,7 @@ class MStory(mongo.Document):
     @classmethod
     def index_all_for_search(cls, offset=0):
-        SearchStory.create_elasticsearch_mapping()
+        if not offset:
+            SearchStory.create_elasticsearch_mapping()
         
         last_pk = Feed.objects.latest('pk').pk
@@ -1741,8 +1752,9 @@ class MStory(mongo.Document):
         story_content = zlib.decompress(self.story_content_z)
         SearchStory.index(story_hash=self.story_hash,
                           story_title=self.story_title,
-                          story_content=story_content,
+                          story_content=prep_for_search(story_content),
                           story_author=self.story_author_name,
+                          story_feed_id=self.story_feed_id,
                           story_date=self.story_date)
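
Taken together, the Feed changes above move matching out of Mongo regex scans: SearchStory.query asks Elasticsearch for the matching story hashes, and MongoDB is only used to hydrate those stories. A minimal sketch of the new read path, assuming NewsBlur's usual module layout (apps.search.models and apps.rss_feeds.models are assumptions) and an already-populated index:

# Hedged sketch of the new search flow; module paths and feed ids are illustrative.
from apps.search.models import SearchStory
from apps.rss_feeds.models import Feed, MStory

feed_ids = [42, 43]  # hypothetical feed ids
story_hashes = SearchStory.query(feed_ids=feed_ids, query="elasticsearch")

# Mongo only hydrates the stories Elasticsearch matched, newest first.
stories_db = MStory.objects(story_hash__in=story_hashes).order_by('-story_date')[0:25]
stories = Feed.format_stories(stories_db)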
@classmethod @classmethod

View file

@@ -1,5 +1,5 @@
 import pyes
-from pyes.query import FuzzyQuery, MatchQuery
+from pyes.query import MatchQuery
 from django.conf import settings
 from utils import log as logging
@@ -24,46 +24,43 @@ class SearchStory:
             'title': {
                 'boost': 2.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
             },
             'content': {
                 'boost': 1.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
             },
             'author': {
                 'boost': 1.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-            },
-            'story_hash': {
-                'index': 'not_analyzed',
-                'store': 'yes',
-                'type': 'string',
+                'analyzer': 'keyword',
             },
             'feed_id': {
-                'store': 'yes',
+                'store': 'no',
                 'type': 'integer'
             },
             'date': {
-                'store': 'yes',
+                'store': 'no',
                 'type': 'date',
             }
         }
         cls.ES.indices.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
     
     @classmethod
-    def index(cls, story_hash, story_title, story_content, story_author, story_date):
+    def index(cls, story_hash, story_title, story_content, story_author, story_feed_id,
+              story_date):
         doc = {
-            "story_hash": story_hash,
             "content": story_content,
             "title": story_title,
             "author": story_author,
+            "feed_id": story_feed_id,
             "date": story_date,
         }
         cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
@@ -71,26 +68,15 @@ class SearchStory:
     @classmethod
     def query(cls, feed_ids, query):
         cls.ES.indices.refresh()
-        q = pyes.query.StringQuery(query)
-        results = cls.ES.search(q, indices=cls.index_name, doc_types=[cls.type_name])
-        logging.info("~FGSearch ~FCstories~FG for: ~SB%s" % query)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by title: ~SB%s" % query)
-            q = FuzzyQuery('title', query)
-            results = cls.ES.search(q)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by content: ~SB%s" % query)
-            q = FuzzyQuery('content', query)
-            results = cls.ES.search(q)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by author: ~SB%s" % query)
-            q = FuzzyQuery('author', query)
-            results = cls.ES.search(q)
-        return results
+        string_q = pyes.query.StringQuery(query, default_operator="AND")
+        feed_q = pyes.query.TermsQuery('feed_id', feed_ids)
+        q = pyes.query.BoolQuery(must=[string_q, feed_q])
+        results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()])
+        logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" %
+                     (query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))
+        return [r.get_id() for r in results]
 
 class SearchFeed:
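
The rewritten SearchStory.query above replaces the old FuzzyQuery fallbacks with a single bool query: a full-text StringQuery ANDed with a TermsQuery on feed_id, so only stories from the requested feeds can match, and only their ids (the story hashes) come back. A standalone, hedged sketch of the same composition against a local Elasticsearch, using the pyes calls the diff introduces (the host, index, and type names are assumptions for illustration):

# Hedged sketch, not part of the commit: run the new-style query by hand with pyes.
import pyes

ES = pyes.ES('127.0.0.1:9200')                         # assumed local Elasticsearch host
string_q = pyes.query.StringQuery("open source rss", default_operator="AND")
feed_q = pyes.query.TermsQuery('feed_id', [42, 43])    # restrict matches to these feeds
q = pyes.query.BoolQuery(must=[string_q, feed_q])
results = ES.search(q, indices='stories-index', doc_types=['stories-type'])
story_hashes = [r.get_id() for r in results]           # document ids are the story hashes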

View file

@@ -165,8 +165,6 @@ NEWSBLUR.Views.Folder = Backbone.View.extend({
         if (this.options.feedbar) {
             this.show_collapsed_folder_count();
-        }
-        if (this.options.feedbar && NEWSBLUR.Globals.is_staff) {
             this.search_view = new NEWSBLUR.Views.FeedSearchView({
                 feedbar_view: this
             }).render();

View file

@@ -51,6 +51,7 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
         } else if (this.showing_fake_folder) {
             $view = $(_.template('\
                 <div class="NB-folder NB-no-hover">\
+                    <div class="NB-search-container"></div>\
                     <% if (show_options) { %>\
                         <div class="NB-feedbar-options-container">\
                             <span class="NB-feedbar-options">\
@@ -75,6 +76,11 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
                 show_options: !NEWSBLUR.reader.active_folder.get('fake') ||
                               NEWSBLUR.reader.active_folder.get('show_options')
             }));
+            this.search_view = new NEWSBLUR.Views.FeedSearchView({
+                feedbar_view: this
+            }).render();
+            this.search_view.blur_search();
+            $(".NB-search-container", $view).html(this.search_view.$el);
         } else if (NEWSBLUR.reader.flags['river_view'] &&
                    NEWSBLUR.reader.active_folder &&
                    NEWSBLUR.reader.active_folder.get('folder_title')) {

View file

@@ -243,6 +243,13 @@ def strip_comments__lxml(html_string=""):
     except (XMLSyntaxError, ParserError):
         return html_string
 
+def prep_for_search(html):
+    html = strip_tags_django(html)
+    html = html.lower()
+    html = xhtml_unescape_tornado(html)
+    
+    return html[:100000]
+
 def linkify(*args, **kwargs):
     return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))
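
The new prep_for_search helper normalizes story HTML before it reaches the index: tags are stripped, the text is lowercased, entities are unescaped, and the result is capped at 100,000 characters. A quick, hedged illustration of the expected behavior (it relies on the Django and Tornado helpers utils/story_functions.py already imports):

# Illustrative example only; the output follows from the helper chain above.
from utils.story_functions import prep_for_search

print(prep_for_search('<p>Hello &amp; <b>Welcome</b></p>'))
# -> "hello & welcome"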