mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-04-13 09:42:01 +00:00
Rewriting story search to use Elasticsearch across feeds. Much faster, but stories must be indexed before they can be searched.
This commit is contained in:
parent
d19bf1b641
commit
3b81c374d4
5 changed files with 59 additions and 50 deletions
|
@ -40,6 +40,7 @@ from utils.feed_functions import timelimit, TimeoutError
|
|||
from utils.feed_functions import relative_timesince
|
||||
from utils.feed_functions import seconds_timesince
|
||||
from utils.story_functions import strip_tags, htmldiff, strip_comments, strip_comments__lxml
|
||||
from utils.story_functions import prep_for_search
|
||||
|
||||
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
|
||||
|
||||
|
@ -1179,25 +1180,34 @@ class Feed(models.Model):
|
|||
|
||||
@classmethod
|
||||
def find_feed_stories(cls, feed_ids, query, offset=0, limit=25):
|
||||
story_ids = SearchStory.query(feed_ids=feed_ids, query=query)
|
||||
stories_db = MStory.objects(
|
||||
Q(story_feed_id__in=feed_ids) &
|
||||
(Q(story_title__icontains=query) |
|
||||
Q(story_author_name__icontains=query) |
|
||||
Q(story_tags__icontains=query))
|
||||
story_hash__in=story_ids
|
||||
).order_by('-story_date')[offset:offset+limit]
|
||||
|
||||
# stories_db = MStory.objects(
|
||||
# Q(story_feed_id__in=feed_ids) &
|
||||
# (Q(story_title__icontains=query) |
|
||||
# Q(story_author_name__icontains=query) |
|
||||
# Q(story_tags__icontains=query))
|
||||
# ).order_by('-story_date')[offset:offset+limit]
|
||||
stories = cls.format_stories(stories_db)
|
||||
|
||||
return stories
|
||||
|
||||
def find_stories(self, query, offset=0, limit=25):
|
||||
SearchStory.query(feed_ids=[self.pk], query=query)
|
||||
|
||||
story_ids = SearchStory.query(feed_ids=[self.pk], query=query)
|
||||
stories_db = MStory.objects(
|
||||
Q(story_feed_id=self.pk) &
|
||||
(Q(story_title__icontains=query) |
|
||||
Q(story_author_name__icontains=query) |
|
||||
Q(story_tags__icontains=query))
|
||||
story_hash__in=story_ids
|
||||
).order_by('-story_date')[offset:offset+limit]
|
||||
|
||||
# stories_db = MStory.objects(
|
||||
# Q(story_feed_id=self.pk) &
|
||||
# (Q(story_title__icontains=query) |
|
||||
# Q(story_author_name__icontains=query) |
|
||||
# Q(story_tags__icontains=query))
|
||||
# ).order_by('-story_date')[offset:offset+limit]
|
||||
|
||||
stories = self.format_stories(stories_db, self.pk)
|
||||
|
||||
return stories
|
||||
|
@ -1722,7 +1732,8 @@ class MStory(mongo.Document):
|
|||
|
||||
@classmethod
|
||||
def index_all_for_search(cls, offset=0):
|
||||
SearchStory.create_elasticsearch_mapping()
|
||||
if not offset:
|
||||
SearchStory.create_elasticsearch_mapping()
|
||||
|
||||
last_pk = Feed.objects.latest('pk').pk
|
||||
for f in xrange(offset, last_pk, 1000):
|
||||
|
@ -1741,8 +1752,9 @@ class MStory(mongo.Document):
|
|||
story_content = zlib.decompress(self.story_content_z)
|
||||
SearchStory.index(story_hash=self.story_hash,
|
||||
story_title=self.story_title,
|
||||
story_content=story_content,
|
||||
story_content=prep_for_search(story_content),
|
||||
story_author=self.story_author_name,
|
||||
story_feed_id=self.story_feed_id,
|
||||
story_date=self.story_date)
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pyes
|
||||
from pyes.query import FuzzyQuery, MatchQuery
|
||||
from pyes.query import MatchQuery
|
||||
from django.conf import settings
|
||||
from utils import log as logging
|
||||
|
||||
|
@ -24,46 +24,43 @@ class SearchStory:
|
|||
'title': {
|
||||
'boost': 2.0,
|
||||
'index': 'analyzed',
|
||||
'store': 'yes',
|
||||
'store': 'no',
|
||||
'type': 'string',
|
||||
"term_vector" : "with_positions_offsets"
|
||||
'analyzer': 'snowball',
|
||||
},
|
||||
'content': {
|
||||
'boost': 1.0,
|
||||
'index': 'analyzed',
|
||||
'store': 'yes',
|
||||
'store': 'no',
|
||||
'type': 'string',
|
||||
"term_vector" : "with_positions_offsets"
|
||||
'analyzer': 'snowball',
|
||||
},
|
||||
'author': {
|
||||
'boost': 1.0,
|
||||
'index': 'analyzed',
|
||||
'store': 'yes',
|
||||
'store': 'no',
|
||||
'type': 'string',
|
||||
},
|
||||
'story_hash': {
|
||||
'index': 'not_analyzed',
|
||||
'store': 'yes',
|
||||
'type': 'string',
|
||||
'analyzer': 'keyword',
|
||||
},
|
||||
'feed_id': {
|
||||
'store': 'yes',
|
||||
'store': 'no',
|
||||
'type': 'integer'
|
||||
},
|
||||
'date': {
|
||||
'store': 'yes',
|
||||
'store': 'no',
|
||||
'type': 'date',
|
||||
}
|
||||
}
|
||||
cls.ES.indices.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
|
||||
|
||||
@classmethod
|
||||
def index(cls, story_hash, story_title, story_content, story_author, story_date):
|
||||
def index(cls, story_hash, story_title, story_content, story_author, story_feed_id,
|
||||
story_date):
|
||||
doc = {
|
||||
"story_hash": story_hash,
|
||||
"content": story_content,
|
||||
"title": story_title,
|
||||
"author": story_author,
|
||||
"feed_id": story_feed_id,
|
||||
"date": story_date,
|
||||
}
|
||||
cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
|
||||
|
@ -71,26 +68,15 @@ class SearchStory:
|
|||
@classmethod
|
||||
def query(cls, feed_ids, query):
|
||||
cls.ES.indices.refresh()
|
||||
q = pyes.query.StringQuery(query)
|
||||
results = cls.ES.search(q, indices=cls.index_name, doc_types=[cls.type_name])
|
||||
logging.info("~FGSearch ~FCstories~FG for: ~SB%s" % query)
|
||||
|
||||
if not results.total:
|
||||
logging.info("~FGSearch ~FCstories~FG by title: ~SB%s" % query)
|
||||
q = FuzzyQuery('title', query)
|
||||
results = cls.ES.search(q)
|
||||
|
||||
if not results.total:
|
||||
logging.info("~FGSearch ~FCstories~FG by content: ~SB%s" % query)
|
||||
q = FuzzyQuery('content', query)
|
||||
results = cls.ES.search(q)
|
||||
|
||||
if not results.total:
|
||||
logging.info("~FGSearch ~FCstories~FG by author: ~SB%s" % query)
|
||||
q = FuzzyQuery('author', query)
|
||||
results = cls.ES.search(q)
|
||||
|
||||
return results
|
||||
|
||||
string_q = pyes.query.StringQuery(query, default_operator="AND")
|
||||
feed_q = pyes.query.TermsQuery('feed_id', feed_ids)
|
||||
q = pyes.query.BoolQuery(must=[string_q, feed_q])
|
||||
results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()])
|
||||
logging.info("~FGSearch ~FCstories~FG for: ~SB%s (across %s feed%s)" %
|
||||
(query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))
|
||||
|
||||
return [r.get_id() for r in results]
|
||||
|
||||
|
||||
class SearchFeed:
|
||||
|
|
|
@ -165,8 +165,6 @@ NEWSBLUR.Views.Folder = Backbone.View.extend({
|
|||
|
||||
if (this.options.feedbar) {
|
||||
this.show_collapsed_folder_count();
|
||||
}
|
||||
if (this.options.feedbar && NEWSBLUR.Globals.is_staff) {
|
||||
this.search_view = new NEWSBLUR.Views.FeedSearchView({
|
||||
feedbar_view: this
|
||||
}).render();
|
||||
|
|
|
@ -51,6 +51,7 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
|
|||
} else if (this.showing_fake_folder) {
|
||||
$view = $(_.template('\
|
||||
<div class="NB-folder NB-no-hover">\
|
||||
<div class="NB-search-container"></div>\
|
||||
<% if (show_options) { %>\
|
||||
<div class="NB-feedbar-options-container">\
|
||||
<span class="NB-feedbar-options">\
|
||||
|
@ -75,6 +76,11 @@ NEWSBLUR.Views.StoryTitlesHeader = Backbone.View.extend({
|
|||
show_options: !NEWSBLUR.reader.active_folder.get('fake') ||
|
||||
NEWSBLUR.reader.active_folder.get('show_options')
|
||||
}));
|
||||
this.search_view = new NEWSBLUR.Views.FeedSearchView({
|
||||
feedbar_view: this
|
||||
}).render();
|
||||
this.search_view.blur_search();
|
||||
$(".NB-search-container", $view).html(this.search_view.$el);
|
||||
} else if (NEWSBLUR.reader.flags['river_view'] &&
|
||||
NEWSBLUR.reader.active_folder &&
|
||||
NEWSBLUR.reader.active_folder.get('folder_title')) {
|
||||
|
|
|
@ -242,7 +242,14 @@ def strip_comments__lxml(html_string=""):
|
|||
return lxml.etree.tostring(clean_html)
|
||||
except (XMLSyntaxError, ParserError):
|
||||
return html_string
|
||||
|
||||
|
||||
def prep_for_search(html):
    """Reduce an HTML string to lowercased text suitable for search indexing.

    The input is passed through ``strip_tags_django``, then
    ``xhtml_unescape_tornado``, then lowercased, and finally cut to its
    first 100000 characters.

    Entities are decoded *before* lowercasing: entity names are
    case-sensitive (``&Dagger;`` and ``&dagger;`` name different
    characters), and letters produced from numeric entities such as
    ``&#65;`` would otherwise escape lowercasing.
    """
    text = strip_tags_django(html)
    # Unescape only after tags are stripped, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being removed as a tag.
    text = xhtml_unescape_tornado(text)
    text = text.lower()

    # Keep only the first 100000 characters (presumably to bound the size of
    # the indexed document -- confirm before changing the limit).
    return text[:100000]
|
||||
|
||||
def linkify(*args, **kwargs):
    """Call ``linkify_tornado`` with the given arguments and return its
    result passed through ``xhtml_unescape_tornado``."""
    linked = linkify_tornado(*args, **kwargs)
    return xhtml_unescape_tornado(linked)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue