From bc0192c3df62e0a87b48fd93f349fd78736db5b9 Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Fri, 4 Jan 2013 16:34:27 -0800
Subject: [PATCH] Adding broken search for feeds.

---
 apps/rss_feeds/models.py |  36 ++++++++++-
 apps/rss_feeds/views.py  |   2 +-
 apps/search/models.py    | 132 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 163 insertions(+), 7 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 8de3f6814..7b088f22a 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -22,7 +22,7 @@ from mongoengine.queryset import OperationError, Q
 from mongoengine.base import ValidationError
 from vendor.timezones.utilities import localtime_for_timezone
 from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds
-from apps.search.models import SearchStarredStory
+from apps.search.models import SearchStarredStory, SearchFeed
 from utils import json_functions as json
 from utils import feedfinder, feedparser
 from utils import urlnorm
@@ -83,7 +83,12 @@ class Feed(models.Model):
         if not self.feed_title:
             self.feed_title = "[Untitled]"
             self.save()
-        return "%s (%s)" % (self.feed_title, self.pk)
+        return "%s (%s - %s/%s/%s)" % (
+            self.feed_title,
+            self.pk,
+            self.num_subscribers,
+            self.active_subscribers,
+            self.premium_subscribers)
 
     @property
     def title(self):
@@ -207,6 +212,15 @@ class Feed(models.Model):
 
         return self
 
+    def index_for_search(self):
+        """Index the feed if it has >1 subscribers and no branch_from_feed."""
+        if self.num_subscribers > 1 and not self.branch_from_feed:
+            SearchFeed.index(feed_id=self.pk,
+                             title=self.feed_title,
+                             address=self.feed_address,
+                             link=self.feed_link,
+                             num_subscribers=self.num_subscribers)
+
     def sync_redis(self):
         return MStory.sync_all_redis(self.pk)
 
@@ -759,7 +773,23 @@ class Feed(models.Model):
         duplicate_feeds = DuplicateFeed.objects.filter(duplicate_address=feed_address)
         if duplicate_feeds:
             return duplicate_feeds[0].feed
-        
+
+    @classmethod
+    def get_by_name(cls, query, limit=1):
+        """Find feeds through the SearchFeed index.
+
+        With limit=1, returns Feed.get_by_id() of the first hit, or None when
+        the search has no hits; otherwise a list of at most `limit` lookups.
+        """
+        results = SearchFeed.query(query)
+        # Trim ids before the lookups so only returned feeds are fetched.
+        feed_ids = [result.feed_id for result in results][:limit]
+
+        if limit == 1:
+            # Guard the empty case: feed_ids[0] would raise IndexError.
+            return Feed.get_by_id(feed_ids[0]) if feed_ids else None
+        return [Feed.get_by_id(feed_id) for feed_id in feed_ids]
+
     def add_update_stories(self, stories, existing_stories, verbose=False):
         ret_values = dict(new=0, updated=0, same=0, error=0)
 
diff --git a/apps/rss_feeds/views.py b/apps/rss_feeds/views.py
index 6bf8c658c..0f09cb720 100644
--- a/apps/rss_feeds/views.py
+++ b/apps/rss_feeds/views.py
@@ -75,7 +75,7 @@ def feed_autocomplete(request):
         return dict(code=-1, message="Specify a search 'term'.")
 
     feeds = []
-    for field in ['feed_address', 'feed_link', 'feed_title']:
+    for field in ['feed_address', 'feed_title', 'feed_link']:
         if not feeds:
             feeds = Feed.objects.filter(**{
                 '%s__icontains' % field: query,
diff --git a/apps/search/models.py b/apps/search/models.py
index c951501b1..ca856ad67 100644
--- a/apps/search/models.py
+++ b/apps/search/models.py
@@ -1,4 +1,7 @@
 import pyes
+from pyes.query import FilteredQuery, FuzzyQuery, TextQuery, PrefixQuery
+from pyes.filters import RangeFilter
+from pyes.utils import ESRange
 from django.conf import settings
 from django.contrib.auth.models import User
 from utils import log as logging
@@ -76,17 +79,140 @@ class SearchStarredStory:
 
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by title: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('title', text)
+            q = FuzzyQuery('title', text)
             results = cls.ES.search(q)
 
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by content: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('content', text)
+            q = FuzzyQuery('content', text)
             results = cls.ES.search(q)
 
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by author: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('author', text)
+            q = FuzzyQuery('author', text)
             results = cls.ES.search(q)
 
         return results
+
+
+class SearchFeed:
+    """Elasticsearch index of feeds, searchable by address, title and link."""
+
+    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
+    name = "feeds"
+
+    @classmethod
+    def create_elasticsearch_mapping(cls):
+        """Delete and recreate the feeds index, then install its mapping."""
+        try:
+            cls.ES.delete_index("%s-index" % cls.name)
+        except pyes.TypeMissingException:
+            print("Index missing, can't delete: %s-index" % cls.name)
+
+        # Not named `settings`: that would shadow the django settings import.
+        index_settings = {
+            "index" : {
+                "analysis" : {
+                    "analyzer" : {
+                        "url_analyzer" : {
+                            "type" : "custom",
+                            "tokenizer" : "urls",
+                            "filter" : ["stop", "url_stop"]
+                        }
+                    },
+                    "tokenizer": {
+                        "urls": {
+                            "type": "uax_url_email",
+                            "max_token_length": 255,
+                        }
+                    },
+                    "filter" : {
+                        "url_stop" : {
+                            "type" : "stop",
+                            "stopwords" : ["http", "https"]
+                        },
+                        # NOTE: no analyzer above uses url_ngram yet.
+                        "url_ngram" : {
+                            "type" : "nGram",
+                            "min_gram" : 2,
+                            "max_gram" : 20,
+                        }
+                    }
+                }
+            }
+        }
+        cls.ES.create_index("%s-index" % cls.name, index_settings)
+        mapping = {
+            'address': {
+                'boost': 3.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+                "analyzer": "url_analyzer",
+            },
+            'title': {
+                'boost': 2.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+            },
+            'link': {
+                'boost': 1.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+                "analyzer": "url_analyzer",
+            },
+            'num_subscribers': {
+                'boost': 1.0,
+                'index': 'not_analyzed',
+                'store': 'yes',
+                'type': 'integer',
+            },
+            'feed_id': {
+                'store': 'yes',
+                'type': 'integer',
+            },
+        }
+        cls.ES.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
+
+    @classmethod
+    def index(cls, feed_id, title, address, link, num_subscribers):
+        """Index one feed document, passing `feed_id` as the document id."""
+        doc = {
+            "feed_id": feed_id,
+            "title": title,
+            "address": address,
+            "link": link,
+            "num_subscribers": num_subscribers,
+        }
+        cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, feed_id)
+
+    @classmethod
+    def query(cls, text):
+        """Search feeds by address, then by title prefix, then by link.
+
+        A fallback runs only when the previous query had no hits. Every query
+        is filtered on num_subscribers (range starting at 2), sorted by
+        num_subscribers descending and capped at 5 results.
+        """
+        cls.ES.refresh()
+
+        sub_filter = RangeFilter(qrange=ESRange('num_subscribers', 2))
+        attempts = (
+            ('address', TextQuery('address', text)),
+            ('title', PrefixQuery('title', text)),
+            # The mapping defines `link` only; there is no `link.partial`.
+            ('link', TextQuery('link', text)),
+        )
+        for label, q in attempts:
+            logging.info("~FGSearch ~FCfeeds~FG by %s: ~SB%s" % (label, text))
+            results = cls.ES.search(FilteredQuery(q, sub_filter),
+                                    sort="num_subscribers:desc", size=5)
+            if results.total:
+                break
+
+        return results