From bc0192c3df62e0a87b48fd93f349fd78736db5b9 Mon Sep 17 00:00:00 2001
From: Samuel Clay <samuel@ofbrooklyn.com>
Date: Fri, 4 Jan 2013 16:34:27 -0800
Subject: [PATCH] Adding broken search for feeds.

---
 apps/rss_feeds/models.py |  29 +++++++++-
 apps/rss_feeds/views.py  |   2 +-
 apps/search/models.py    | 122 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 146 insertions(+), 7 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 8de3f6814..7b088f22a 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -22,7 +22,7 @@ from mongoengine.queryset import OperationError, Q
 from mongoengine.base import ValidationError
 from vendor.timezones.utilities import localtime_for_timezone
 from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds
-from apps.search.models import SearchStarredStory
+from apps.search.models import SearchStarredStory, SearchFeed
 from utils import json_functions as json
 from utils import feedfinder, feedparser
 from utils import urlnorm
@@ -83,7 +83,12 @@ class Feed(models.Model):
         if not self.feed_title:
             self.feed_title = "[Untitled]"
             self.save()
-        return "%s (%s)" % (self.feed_title, self.pk)
+        return "%s (%s - %s/%s/%s)" % (
+            self.feed_title, 
+            self.pk, 
+            self.num_subscribers,
+            self.active_subscribers,
+            self.premium_subscribers)
     
     @property
     def title(self):
@@ -207,6 +212,14 @@ class Feed(models.Model):
                 
             return self
 
+    def index_for_search(self):
+        """Index the feed in SearchFeed if it has >1 subscriber and no branch_from_feed."""
+        if self.num_subscribers <= 1 or self.branch_from_feed:
+            return
+        SearchFeed.index(feed_id=self.pk, title=self.feed_title,
+                         address=self.feed_address, link=self.feed_link,
+                         num_subscribers=self.num_subscribers)
+    
     
     def sync_redis(self):
         return MStory.sync_all_redis(self.pk)
@@ -759,7 +772,17 @@ class Feed(models.Model):
                 duplicate_feeds = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                 if duplicate_feeds:
                     return duplicate_feeds[0].feed
-                
+    
+    @classmethod
+    def get_by_name(cls, query, limit=1):
+        """Look up feeds via SearchFeed: a single feed (or None) if limit == 1, else a list."""
+        results = SearchFeed.query(query)
+        # Trim the ids first so no more than `limit` feeds are fetched.
+        feed_ids = [result.feed_id for result in results][:limit]
+        if limit == 1:  # a search miss returns None instead of IndexError on [0]
+            return Feed.get_by_id(feed_ids[0]) if feed_ids else None
+        return [Feed.get_by_id(f) for f in feed_ids]
+        
     def add_update_stories(self, stories, existing_stories, verbose=False):
         ret_values = dict(new=0, updated=0, same=0, error=0)
 
diff --git a/apps/rss_feeds/views.py b/apps/rss_feeds/views.py
index 6bf8c658c..0f09cb720 100644
--- a/apps/rss_feeds/views.py
+++ b/apps/rss_feeds/views.py
@@ -75,7 +75,7 @@ def feed_autocomplete(request):
         return dict(code=-1, message="Specify a search 'term'.")
         
     feeds = []
-    for field in ['feed_address', 'feed_link', 'feed_title']:
+    for field in ['feed_address', 'feed_title', 'feed_link']:
         if not feeds:
             feeds = Feed.objects.filter(**{
                 '%s__icontains' % field: query,
diff --git a/apps/search/models.py b/apps/search/models.py
index c951501b1..ca856ad67 100644
--- a/apps/search/models.py
+++ b/apps/search/models.py
@@ -1,4 +1,7 @@
 import pyes
+from pyes.query import FilteredQuery, FuzzyQuery, TextQuery, PrefixQuery
+from pyes.filters import RangeFilter
+from pyes.utils import ESRange
 from django.conf import settings
 from django.contrib.auth.models import User
 from utils import log as logging
@@ -76,17 +79,130 @@ class SearchStarredStory:
         
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by title: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('title', text)
+            q = FuzzyQuery('title', text)
             results = cls.ES.search(q)
             
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by content: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('content', text)
+            q = FuzzyQuery('content', text)
             results = cls.ES.search(q)
             
         if not results.total:
             logging.user(user, "~FGSearch ~FCsaved stories~FG by author: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('author', text)
+            q = FuzzyQuery('author', text)
             results = cls.ES.search(q)
             
         return results
+
+
+class SearchFeed:
+    """Elasticsearch index of feeds, searchable by address, title and link."""
+
+    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
+    name = "feeds"
+
+    @classmethod
+    def create_elasticsearch_mapping(cls):
+        """Delete and recreate the feeds index, then install its field mapping."""
+        index_name = "%s-index" % cls.name
+        try:
+            cls.ES.delete_index(index_name)
+        except pyes.TypeMissingException:
+            # NOTE(review): confirm this is what pyes raises for a missing index.
+            print("Index missing, can't delete: %s" % index_name)
+
+        # Not called `settings`, which would shadow the django.conf import.
+        index_settings = {
+            "index": {
+                "analysis": {
+                    "analyzer": {
+                        "url_analyzer": {
+                            "type": "custom",
+                            "tokenizer": "urls",
+                            "filter": ["stop", "url_stop"],
+                        },
+                    },
+                    "tokenizer": {
+                        "urls": {
+                            "type": "uax_url_email",
+                            "max_token_length": 255,
+                        },
+                    },
+                    "filter": {
+                        "url_stop": {
+                            "type": "stop",
+                            "stopwords": ["http", "https"],
+                        },
+                        # Defined but not referenced by any analyzer above.
+                        "url_ngram": {
+                            "type": "nGram",
+                            "min_gram": 2,
+                            "max_gram": 20,
+                        },
+                    },
+                },
+            },
+        }
+        cls.ES.create_index(index_name, index_settings)
+        # URL-like fields use url_analyzer; address carries the highest boost.
+        mapping = {
+            'address': {
+                'boost': 3.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                'term_vector': 'with_positions_offsets',
+                'analyzer': 'url_analyzer',
+            },
+            'title': {
+                'boost': 2.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                'term_vector': 'with_positions_offsets',
+            },
+            'link': {
+                'boost': 1.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                'term_vector': 'with_positions_offsets',
+                'analyzer': 'url_analyzer',
+            },
+            'num_subscribers': {
+                'boost': 1.0,
+                'index': 'not_analyzed',
+                'store': 'yes',
+                'type': 'integer',
+            },
+            'feed_id': {'store': 'yes', 'type': 'integer'},
+        }
+        cls.ES.put_mapping("%s-type" % cls.name, {'properties': mapping}, [index_name])
+
+    @classmethod
+    def index(cls, feed_id, title, address, link, num_subscribers):
+        """Index one feed's search document, using feed_id as the document id."""
+        doc = dict(feed_id=feed_id, title=title, address=address, link=link,
+                   num_subscribers=num_subscribers)
+        cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, feed_id)
+
+    @classmethod
+    def query(cls, text):
+        """Search feeds by address, then by title prefix, then by link.
+
+        Returns the pyes results of the first attempt that has hits (else
+        the last attempt): up to 5 feeds with 2+ subscribers, most first.
+        """
+        cls.ES.refresh()
+        sub_filter = RangeFilter(qrange=ESRange('num_subscribers', 2))
+        # 'link', not 'link.partial': the mapping defines no 'partial' sub-field.
+        attempts = (('address', TextQuery), ('title', PrefixQuery), ('link', TextQuery))
+        for field, query_class in attempts:
+            logging.info("~FGSearch ~FCfeeds~FG by %s: ~SB%s" % (field, text))
+            # Search only the feeds index rather than every index on the cluster.
+            results = cls.ES.search(FilteredQuery(query_class(field, text), sub_filter),
+                                    indices=["%s-index" % cls.name],
+                                    sort="num_subscribers:desc", size=5)
+            if results.total:
+                break
+        return results