Rewriting search to use Elasticsearch across feeds. Much faster, just needs indexing.

This commit is contained in:
Samuel Clay 2014-04-15 14:17:15 -07:00
parent d19bf1b641
commit 3b81c374d4
5 changed files with 59 additions and 50 deletions

View file

@ -40,6 +40,7 @@ from utils.feed_functions import timelimit, TimeoutError
from utils.feed_functions import relative_timesince
from utils.feed_functions import seconds_timesince
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
from utils.story_functions import prep_for_search
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@ -1179,25 +1180,34 @@ class Feed(models.Model):
@classmethod
def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
story_ids = SearchStory.query(feed_ids=feed_ids, query=query)
stories_db = MStory.objects(
Q(story_feed_id__in=feed_ids) &
(Q(story_title__icontains=query) |
Q(story_author_name__icontains=query) |
Q(story_tags__icontains=query))
story_hash__in=story_ids
).order_by('-story_date')[offset:offset+limit]
# stories_db = MStory.objects(
# Q(story_feed_id__in=feed_ids) &
# (Q(story_title__icontains=query) |
# Q(story_author_name__icontains=query) |
# Q(story_tags__icontains=query))
# ).order_by('-story_date')[offset:offset+limit]
stories = cls.format_stories(stories_db)
return stories
def find_stories(self, query, offset=0, limit=25):
SearchStory.query(feed_ids=[self.pk], query=query)
story_ids = SearchStory.query(feed_ids=[self.pk], query=query)
stories_db = MStory.objects(
Q(story_feed_id=self.pk) &
(Q(story_title__icontains=query) |
Q(story_author_name__icontains=query) |
Q(story_tags__icontains=query))
story_hash__in=story_ids
).order_by('-story_date')[offset:offset+limit]
# stories_db = MStory.objects(
# Q(story_feed_id=self.pk) &
# (Q(story_title__icontains=query) |
# Q(story_author_name__icontains=query) |
# Q(story_tags__icontains=query))
# ).order_by('-story_date')[offset:offset+limit]
stories = self.format_stories(stories_db, self.pk)
return stories
@ -1722,7 +1732,8 @@ class MStory(mongo.Document):
@classmethod
def index_all_for_search(cls, offset=0):
SearchStory.create_elasticsearch_mapping()
if not offset:
SearchStory.create_elasticsearch_mapping()
last_pk = Feed.objects.latest('pk').pk
for f in xrange(offset, last_pk, 1000):
@ -1741,8 +1752,9 @@ class MStory(mongo.Document):
story_content = zlib.decompress(self.story_content_z)
SearchStory.index(story_hash=self.story_hash,
story_title=self.story_title,
story_content=story_content,
story_content=prep_for_search(story_content),
story_author=self.story_author_name,
story_feed_id=self.story_feed_id,
story_date=self.story_date)
@classmethod

View file

@ -1,5 +1,5 @@
import pyes
from pyes.query import FuzzyQuery, MatchQuery
from pyes.query import MatchQuery
from django.conf import settings
from utils import log as logging
@ -24,46 +24,43 @@ class SearchStory:
'title': {
'boost': 2.0,
'index': 'analyzed',
'store': 'yes',
'store': 'no',
'type': 'string',
"term_vector" : "with_positions_offsets"
'analyzer': 'snowball',
},
'content': {
'boost': 1.0,
'index': 'analyzed',
'store': 'yes',
'store': 'no',
'type': 'string',
"term_vector" : "with_positions_offsets"
'analyzer': 'snowball',
},
'author': {
'boost': 1.0,
'index': 'analyzed',
'store': 'yes',
'store': 'no',
'type': 'string',
},
'story_hash': {
'index': 'not_analyzed',
'store': 'yes',
'type': 'string',
'analyzer': 'keyword',
},
'feed_id': {
'store': 'yes',
'store': 'no',
'type': 'integer'
},
'date': {
'store': 'yes',
'store': 'no',
'type': 'date',
}
}
cls.ES.indices.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
@classmethod
def index(cls, story_hash, story_title, story_content, story_author, story_date):
def index(cls, story_hash, story_title, story_content, story_author, story_feed_id,
story_date):
doc = {
"story_hash": story_hash,
"content": story_content,
"title": story_title,
"author": story_author,
"feed_id": story_feed_id,
"date": story_date,
}
cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
@ -71,26 +68,15 @@ class SearchStory:
@classmethod
def query(cls, feed_ids, query):
cls.ES.indices.refresh()
q = pyes.query.StringQuery(query)
results = cls.ES.search(q, indices=cls.index_name, doc_types=[cls.type_name])
logging.info("~FGSearch ~FCstories~FG for: ~SB%s" % query)
if not results.total:
logging.info("~FGSearch ~FCstories~FG by title: ~SB%s" % query)
q = FuzzyQuery('title', query)
results = cls.ES.search(q)
if not results.total:
logging.info("~FGSearch ~FCstories~FG by content: ~SB%s" % query)
q = FuzzyQuery('content', query)
results = cls.ES.search(q)
if not results.total:
logging.info("~FGSearch ~FCstories~FG by author: ~SB%s" % query)
q = FuzzyQuery('author', query)
results = cls.ES.search(q)
return results
string_q = pyes.query.StringQuery(query, default_operator="AND")
feed_q = pyes.query.TermsQuery('feed_id', feed_ids)
q = pyes.query.BoolQuery(must=[string_q, feed_q])
results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()])
logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" %
(query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))
return [r.get_id() for r in results]
class SearchFeed:

View file

@ -165,8 +165,6 @@ NEWSBLUR.Views.Folder = Backbone.View.extend({
if (this.options.feedbar) {
this.show_collapsed_folder_count();
}
if (this.options.feedbar && NEWSBLUR.Globals.is_staff) {
this.search_view = new NEWSBLUR.Views.FeedSearchView({
feedbar_view: this
}).render();

View file

@ -51,6 +51,7 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
} else if (this.showing_fake_folder) {
$view = $(_.template('\
<div class="NB-folder NB-no-hover">\
<div class="NB-search-container"></div>\
<% if (show_options) { %>\
<div class="NB-feedbar-options-container">\
<span class="NB-feedbar-options">\
@ -75,6 +76,11 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
show_options: !NEWSBLUR.reader.active_folder.get('fake') ||
NEWSBLUR.reader.active_folder.get('show_options')
}));
this.search_view = new NEWSBLUR.Views.FeedSearchView({
feedbar_view: this
}).render();
this.search_view.blur_search();
$(".NB-search-container", $view).html(this.search_view.$el);
} else if (NEWSBLUR.reader.flags['river_view'] &&
NEWSBLUR.reader.active_folder &&
NEWSBLUR.reader.active_folder.get('folder_title')) {

View file

@ -242,7 +242,14 @@ def strip_comments__lxml(html_string=""):
return lxml.etree.tostring(clean_html)
except (XMLSyntaxError, ParserError):
return html_string
def prep_for_search(html, max_length=100000):
    """Reduce story HTML to lowercase plain text for the search index.

    The markup is passed through ``strip_tags_django``, entities are decoded
    with ``xhtml_unescape_tornado``, the result is lowercased and finally
    truncated to ``max_length`` characters.

    Args:
        html: Story content as an HTML string. Empty or ``None`` input yields
            an empty string without calling the helpers.
        max_length: Maximum number of characters to keep. Defaults to
            100000, the cap this function has always applied; ``None``
            disables truncation.

    Returns:
        The stripped, entity-decoded, lowercased text, at most ``max_length``
        characters long.
    """
    if not html:
        return ''

    text = strip_tags_django(html)
    # Decode entities *before* lowercasing. Named entities are case-sensitive
    # (&Dagger; and &dagger; are different characters), so lowercasing the
    # raw markup first would silently swap them. Decoding first also means
    # characters that only appear after decoding (e.g. "&#65;" -> "A") are
    # lowercased like the rest of the text.
    # NOTE(review): assumes xhtml_unescape_tornado is tornado's
    # escape.xhtml_unescape imported under an alias -- confirm at the import.
    text = xhtml_unescape_tornado(text)
    text = text.lower()

    return text[:max_length]
def linkify(*args, **kwargs):
    """Call ``linkify_tornado`` and entity-decode its result.

    All positional and keyword arguments are forwarded unchanged to
    ``linkify_tornado``; whatever it returns is passed through
    ``xhtml_unescape_tornado`` and returned.
    """
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)