Rewriting search to use Elasticsearch across feeds. Much faster, but stories must be indexed in Elasticsearch before they show up in search results.

2025-04-13 09:42:01 +00:00 · 2014-04-15 14:17:15 -07:00 · 2014-04-15 14:17:15 -07:00 · 3b81c374d4
commit 3b81c374d4
parent d19bf1b641
5 changed files with 59 additions and 50 deletions
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -40,6 +40,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from utils.feed_functions import relative_timesince
 from utils.feed_functions import seconds_timesince
 from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
+from utils.story_functions import prep_for_search

 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)

@@ -1179,25 +1180,34 @@ class Feed(models.Model):
    
    @classmethod
    def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
+        story_ids = SearchStory.query(feed_ids=feed_ids, query=query)
        stories_db = MStory.objects(
-            Q(story_feed_id__in=feed_ids) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
        ).order_by('-story_date')[offset:offset+limit]
+
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id__in=feed_ids) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
        stories = cls.format_stories(stories_db)
        
        return stories
        
    def find_stories(self, query, offset=0, limit=25):
-        SearchStory.query(feed_ids=[self.pk], query=query)
-
+        story_ids = SearchStory.query(feed_ids=[self.pk], query=query)
        stories_db = MStory.objects(
-            Q(story_feed_id=self.pk) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
        ).order_by('-story_date')[offset:offset+limit]
+        
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id=self.pk) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
+        
        stories = self.format_stories(stories_db, self.pk)
        
        return stories
@@ -1722,7 +1732,8 @@ class MStory(mongo.Document):

    @classmethod
    def index_all_for_search(cls, offset=0):
-        SearchStory.create_elasticsearch_mapping()
+        if not offset:
+            SearchStory.create_elasticsearch_mapping()
        
        last_pk = Feed.objects.latest('pk').pk
        for f in xrange(offset, last_pk, 1000):
@@ -1741,8 +1752,9 @@ class MStory(mongo.Document):
        story_content = zlib.decompress(self.story_content_z)
        SearchStory.index(story_hash=self.story_hash, 
                          story_title=self.story_title, 
-                          story_content=story_content, 
+                          story_content=prep_for_search(story_content), 
                          story_author=self.story_author_name, 
+                          story_feed_id=self.story_feed_id, 
                          story_date=self.story_date)
    
    @classmethod
--- a/apps/search/models.py
+++ b/apps/search/models.py
@@ -1,5 +1,5 @@
 import pyes
-from pyes.query import FuzzyQuery, MatchQuery
+from pyes.query import MatchQuery
 from django.conf import settings
 from utils import log as logging

@@ -24,46 +24,43 @@ class SearchStory:
            'title': {
                'boost': 2.0,
                'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
            },
            'content': {
                'boost': 1.0,
                'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
            },
            'author': {
                'boost': 1.0,
                'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                'type': 'string',   
-            },
-            'story_hash': {
-                'index': 'not_analyzed',
-                'store': 'yes',
-                'type': 'string',
+                'analyzer': 'keyword',
            },
            'feed_id': {
-                'store': 'yes',
+                'store': 'no',
                'type': 'integer'
            },
            'date': {
-                'store': 'yes',
+                'store': 'no',
                'type': 'date',
            }
        }
        cls.ES.indices.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
        
    @classmethod
-    def index(cls, story_hash, story_title, story_content, story_author, story_date):
+    def index(cls, story_hash, story_title, story_content, story_author, story_feed_id, 
+              story_date):
        doc = {
-            "story_hash": story_hash,
            "content": story_content,
            "title": story_title,
            "author": story_author,
+            "feed_id": story_feed_id,
            "date": story_date,
        }
        cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
@@ -71,26 +68,15 @@ class SearchStory:
    @classmethod
    def query(cls, feed_ids, query):
        cls.ES.indices.refresh()
-        q = pyes.query.StringQuery(query)
-        results = cls.ES.search(q, indices=cls.index_name, doc_types=[cls.type_name])
-        logging.info("~FGSearch ~FCstories~FG for: ~SB%s" % query)
-        
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by title: ~SB%s" % query)
-            q = FuzzyQuery('title', query)
-            results = cls.ES.search(q)
-            
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by content: ~SB%s" % query)
-            q = FuzzyQuery('content', query)
-            results = cls.ES.search(q)
-            
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by author: ~SB%s" % query)
-            q = FuzzyQuery('author', query)
-            results = cls.ES.search(q)
-            
-        return results
+
+        string_q = pyes.query.StringQuery(query, default_operator="AND")
+        feed_q = pyes.query.TermsQuery('feed_id', feed_ids)
+        q = pyes.query.BoolQuery(must=[string_q, feed_q])
+        results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()])
+        logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" % 
+                     (query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))
+
+        return [r.get_id() for r in results]


 class SearchFeed:
--- a/media/js/newsblur/views/folder_view.js
+++ b/media/js/newsblur/views/folder_view.js
@@ -165,8 +165,6 @@ NEWSBLUR.Views.Folder = Backbone.View.extend({
        
        if (this.options.feedbar) {
            this.show_collapsed_folder_count();
-        }
-        if (this.options.feedbar && NEWSBLUR.Globals.is_staff) {
            this.search_view = new NEWSBLUR.Views.FeedSearchView({
                feedbar_view: this
            }).render();
--- a/media/js/newsblur/views/story_titles_header_view.js
+++ b/media/js/newsblur/views/story_titles_header_view.js
@@ -51,6 +51,7 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
        } else if (this.showing_fake_folder) {
            $view = $(_.template('\
                <div class="NB-folder NB-no-hover">\
+                    <div class="NB-search-container"></div>\
                    <% if (show_options) { %>\
                        <div class="NB-feedbar-options-container">\
                            <span class="NB-feedbar-options">\
@@ -75,6 +76,11 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
                show_options: !NEWSBLUR.reader.active_folder.get('fake') ||
                              NEWSBLUR.reader.active_folder.get('show_options')
            }));
+            this.search_view = new NEWSBLUR.Views.FeedSearchView({
+                feedbar_view: this
+            }).render();
+            this.search_view.blur_search();
+            $(".NB-search-container", $view).html(this.search_view.$el);
        } else if (NEWSBLUR.reader.flags['river_view'] && 
                   NEWSBLUR.reader.active_folder &&
                   NEWSBLUR.reader.active_folder.get('folder_title')) {
--- a/utils/story_functions.py
+++ b/utils/story_functions.py
@@ -242,7 +242,14 @@ def strip_comments__lxml(html_string=""):
        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
-        
+
+def prep_for_search(html):
+    html = strip_tags_django(html)
+    html = html.lower()
+    html = xhtml_unescape_tornado(html)
+    
+    return html[:100000]
+    
 def linkify(*args, **kwargs):
    return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))