Rewriting search to use Elasticsearch across feeds. Much faster, just needs indexing.

Samuel Clay 2014-04-15 14:17:15 -07:00
parent d19bf1b641
commit 3b81c374d4
5 changed files with 59 additions and 50 deletions
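The "just needs indexing" part of the commit message refers to backfilling the new Elasticsearch index once this code is deployed. A minimal, hedged sketch of that one-off step from a Django shell, assuming the models live in apps.rss_feeds.models as in NewsBlur's usual layout:

# Hedged sketch: rebuild the story search index after deploying this commit.
# The module path is an assumption; index_all_for_search comes from the diff below.
from apps.rss_feeds.models import MStory

# With offset=0 the Elasticsearch mapping is recreated, then stories are
# indexed feed by feed up to the latest feed pk.
MStory.index_all_for_search()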

View file

@@ -40,6 +40,7 @@ from utils.feed_functions import timelimit, TimeoutError
 from utils.feed_functions import relative_timesince
 from utils.feed_functions import seconds_timesince
 from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
+from utils.story_functions import prep_for_search
 
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -1179,25 +1180,34 @@ class Feed(models.Model):
     @classmethod
     def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
+        story_ids = SearchStory.query(feed_ids=feed_ids, query=query)
         stories_db = MStory.objects(
-            Q(story_feed_id__in=feed_ids) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
         ).order_by('-story_date')[offset:offset+limit]
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id__in=feed_ids) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
         stories = cls.format_stories(stories_db)
         
         return stories
     
     def find_stories(self, query, offset=0, limit=25):
-        SearchStory.query(feed_ids=[self.pk], query=query)
+        story_ids = SearchStory.query(feed_ids=[self.pk], query=query)
         stories_db = MStory.objects(
-            Q(story_feed_id=self.pk) &
-            (Q(story_title__icontains=query) |
-             Q(story_author_name__icontains=query) |
-             Q(story_tags__icontains=query))
+            story_hash__in=story_ids
        ).order_by('-story_date')[offset:offset+limit]
+        # stories_db = MStory.objects(
+        #     Q(story_feed_id=self.pk) &
+        #     (Q(story_title__icontains=query) |
+        #      Q(story_author_name__icontains=query) |
+        #      Q(story_tags__icontains=query))
+        # ).order_by('-story_date')[offset:offset+limit]
         stories = self.format_stories(stories_db, self.pk)
         
         return stories
@@ -1722,6 +1732,7 @@ class MStory(mongo.Document):
     @classmethod
     def index_all_for_search(cls, offset=0):
-        SearchStory.create_elasticsearch_mapping()
+        if not offset:
+            SearchStory.create_elasticsearch_mapping()
         
         last_pk = Feed.objects.latest('pk').pk
@@ -1741,8 +1752,9 @@ class MStory(mongo.Document):
         story_content = zlib.decompress(self.story_content_z)
         SearchStory.index(story_hash=self.story_hash,
                           story_title=self.story_title,
-                          story_content=story_content,
+                          story_content=prep_for_search(story_content),
                           story_author=self.story_author_name,
+                          story_feed_id=self.story_feed_id,
                           story_date=self.story_date)
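
Taken together, the Feed changes above move matching out of Mongo regex scans: SearchStory.query asks Elasticsearch for the matching story hashes, and MongoDB is only used to hydrate those stories. A minimal sketch of the new read path, assuming NewsBlur's usual module layout (apps.search.models and apps.rss_feeds.models are assumptions) and an already-populated index:

# Hedged sketch of the new search flow; module paths and feed ids are illustrative.
from apps.search.models import SearchStory
from apps.rss_feeds.models import Feed, MStory

feed_ids = [42, 43]  # hypothetical feed ids
story_hashes = SearchStory.query(feed_ids=feed_ids, query="elasticsearch")

# Mongo only hydrates the stories Elasticsearch matched, newest first.
stories_db = MStory.objects(story_hash__in=story_hashes).order_by('-story_date')[0:25]
stories = Feed.format_stories(stories_db)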
@classmethod @classmethod

View file

@@ -1,5 +1,5 @@
 import pyes
-from pyes.query import FuzzyQuery, MatchQuery
+from pyes.query import MatchQuery
 from django.conf import settings
 from utils import log as logging
@@ -24,46 +24,43 @@ class SearchStory:
             'title': {
                 'boost': 2.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
             },
             'content': {
                 'boost': 1.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-                "term_vector" : "with_positions_offsets"
+                'analyzer': 'snowball',
             },
             'author': {
                 'boost': 1.0,
                 'index': 'analyzed',
-                'store': 'yes',
+                'store': 'no',
                 'type': 'string',
-            },
-            'story_hash': {
-                'index': 'not_analyzed',
-                'store': 'yes',
-                'type': 'string',
+                'analyzer': 'keyword',
             },
             'feed_id': {
-                'store': 'yes',
+                'store': 'no',
                 'type': 'integer'
             },
             'date': {
-                'store': 'yes',
+                'store': 'no',
                 'type': 'date',
             }
         }
         cls.ES.indices.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
     
     @classmethod
-    def index(cls, story_hash, story_title, story_content, story_author, story_date):
+    def index(cls, story_hash, story_title, story_content, story_author, story_feed_id,
+              story_date):
         doc = {
-            "story_hash": story_hash,
             "content": story_content,
             "title": story_title,
             "author": story_author,
+            "feed_id": story_feed_id,
             "date": story_date,
         }
         cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
@@ -71,26 +68,15 @@ class SearchStory:
     @classmethod
     def query(cls, feed_ids, query):
         cls.ES.indices.refresh()
-        q = pyes.query.StringQuery(query)
-        results = cls.ES.search(q, indices=cls.index_name, doc_types=[cls.type_name])
-        logging.info("~FGSearch ~FCstories~FG for: ~SB%s" % query)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by title: ~SB%s" % query)
-            q = FuzzyQuery('title', query)
-            results = cls.ES.search(q)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by content: ~SB%s" % query)
-            q = FuzzyQuery('content', query)
-            results = cls.ES.search(q)
-        if not results.total:
-            logging.info("~FGSearch ~FCstories~FG by author: ~SB%s" % query)
-            q = FuzzyQuery('author', query)
-            results = cls.ES.search(q)
-        return results
+        string_q = pyes.query.StringQuery(query, default_operator="AND")
+        feed_q = pyes.query.TermsQuery('feed_id', feed_ids)
+        q = pyes.query.BoolQuery(must=[string_q, feed_q])
+        results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()])
+        logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" %
+                     (query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))
+        return [r.get_id() for r in results]
 
 class SearchFeed:
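
The rewritten SearchStory.query above replaces the old FuzzyQuery fallbacks with a single bool query: a full-text StringQuery ANDed with a TermsQuery on feed_id, so only stories from the requested feeds can match, and only their ids (the story hashes) come back. A standalone, hedged sketch of the same composition against a local Elasticsearch, using the pyes calls the diff introduces (the host, index, and type names are assumptions for illustration):

# Hedged sketch, not part of the commit: run the new-style query by hand with pyes.
import pyes

ES = pyes.ES('127.0.0.1:9200')                         # assumed local Elasticsearch host
string_q = pyes.query.StringQuery("open source rss", default_operator="AND")
feed_q = pyes.query.TermsQuery('feed_id', [42, 43])    # restrict matches to these feeds
q = pyes.query.BoolQuery(must=[string_q, feed_q])
results = ES.search(q, indices='stories-index', doc_types=['stories-type'])
story_hashes = [r.get_id() for r in results]           # document ids are the story hashes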

View file

@@ -165,8 +165,6 @@ NEWSBLUR.Views.Folder = Backbone.View.extend({
         if (this.options.feedbar) {
             this.show_collapsed_folder_count();
-        }
-        if (this.options.feedbar && NEWSBLUR.Globals.is_staff) {
             this.search_view = new NEWSBLUR.Views.FeedSearchView({
                 feedbar_view: this
             }).render();

View file

@@ -51,6 +51,7 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
         } else if (this.showing_fake_folder) {
             $view = $(_.template('\
                 <div class="NB-folder NB-no-hover">\
+                    <div class="NB-search-container"></div>\
                     <% if (show_options) { %>\
                         <div class="NB-feedbar-options-container">\
                             <span class="NB-feedbar-options">\
@@ -75,6 +76,11 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
                 show_options: !NEWSBLUR.reader.active_folder.get('fake') ||
                               NEWSBLUR.reader.active_folder.get('show_options')
             }));
+            this.search_view = new NEWSBLUR.Views.FeedSearchView({
+                feedbar_view: this
+            }).render();
+            this.search_view.blur_search();
+            $(".NB-search-container", $view).html(this.search_view.$el);
         } else if (NEWSBLUR.reader.flags['river_view'] &&
                    NEWSBLUR.reader.active_folder &&
                    NEWSBLUR.reader.active_folder.get('folder_title')) {

View file

@@ -243,6 +243,13 @@ def strip_comments__lxml(html_string=""):
     except (XMLSyntaxError, ParserError):
         return html_string
 
+def prep_for_search(html):
+    html = strip_tags_django(html)
+    html = html.lower()
+    html = xhtml_unescape_tornado(html)
+    
+    return html[:100000]
+
 def linkify(*args, **kwargs):
     return xhtml_unescape_tornado(linkify_tornado(*args, **kwargs))
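
The new prep_for_search helper normalizes story HTML before it reaches the index: tags are stripped, the text is lowercased, entities are unescaped, and the result is capped at 100,000 characters. A quick, hedged illustration of the expected behavior (it relies on the Django and Tornado helpers utils/story_functions.py already imports):

# Illustrative example only; the output follows from the helper chain above.
from utils.story_functions import prep_for_search

print(prep_for_search('<p>Hello &amp; <b>Welcome</b></p>'))
# -> "hello & welcome"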