import re
import time
import datetime

import pymongo
import pyes
import redis
import celery
import mongoengine as mongo

from django.conf import settings
from django.contrib.auth.models import User

from apps.search.tasks import IndexSubscriptionsForSearch
from apps.search.tasks import IndexSubscriptionsChunkForSearch
from apps.search.tasks import IndexFeedsForSearch
from utils import log as logging
from utils.feed_functions import chunks


class MUserSearch(mongo.Document):
    '''Search index state of a user's subscriptions.'''
    user_id = mongo.IntField(unique=True)
    last_search_date = mongo.DateTimeField()
    subscriptions_indexed = mongo.BooleanField()
    subscriptions_indexing = mongo.BooleanField()

    meta = {
        'collection': 'user_search',
        'indexes': ['user_id'],
        'allow_inheritance': False,
    }

    @classmethod
    def get_user(cls, user_id, create=True):
        try:
            user_search = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                     .get(user_id=user_id)
        except cls.DoesNotExist:
            if create:
                user_search = cls.objects.create(user_id=user_id)
            else:
                user_search = None

        return user_search

    def touch_search_date(self):
        # Kick off a full subscription index the first time a user ever searches.
        if not self.subscriptions_indexed and not self.subscriptions_indexing:
            self.schedule_index_subscriptions_for_search()
            self.subscriptions_indexing = True

        self.last_search_date = datetime.datetime.now()
        self.save()
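
    # A minimal usage sketch (the caller and its variables are hypothetical,
    # not part of this module): a search view would touch the per-user record
    # before querying, so the first search triggers a background index of that
    # user's subscriptions.
    #
    #   user_search = MUserSearch.get_user(request.user.pk)
    #   user_search.touch_search_date()
    #   story_hashes = SearchStory.query(feed_ids=feed_ids, query="python",
    #                                    order="newest", offset=0, limit=10)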

    def schedule_index_subscriptions_for_search(self):
        IndexSubscriptionsForSearch().apply_async(kwargs=dict(user_id=self.user_id),
                                                  queue='search_indexer_tasker')

    # Should be run as a background task
    def index_subscriptions_for_search(self):
        from apps.rss_feeds.models import Feed
        from apps.reader.models import UserSubscription

        SearchStory.create_elasticsearch_mapping()

        start = time.time()
        user = User.objects.get(pk=self.user_id)
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'search_index_complete:start')

        subscriptions = UserSubscription.objects.filter(user=user).only('feed')
        total = subscriptions.count()

        feed_ids = []
        for sub in subscriptions:
            try:
                feed_ids.append(sub.feed.pk)
            except Feed.DoesNotExist:
                continue

        # Split the feeds into small chunks and index each chunk in a parallel
        # celery task, blocking until the whole group finishes.
        feed_id_chunks = [c for c in chunks(feed_ids, 6)]
        logging.user(user, "~FCIndexing ~SB%s feeds~SN in %s chunks..." %
                     (total, len(feed_id_chunks)))

        tasks = [IndexSubscriptionsChunkForSearch.s(feed_ids=feed_id_chunk,
                                                    user_id=self.user_id
                                                    ).set(queue='search_indexer')
                 for feed_id_chunk in feed_id_chunks]
        group = celery.group(*tasks)
        res = group.apply_async(queue='search_indexer')
        res.join_native()

        duration = time.time() - start
        logging.user(user, "~FCIndexed ~SB%s feeds~SN in ~FM~SB%s~FC~SN sec." %
                     (total, round(duration, 2)))
        r.publish(user.username, 'search_index_complete:done')

        self.subscriptions_indexed = True
        self.subscriptions_indexing = False
        self.save()
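
    # For illustration only, assuming utils.feed_functions.chunks yields
    # successive fixed-size slices: a user with 14 subscriptions would be
    # split as
    #
    #   list(chunks([1, 2, 3, ..., 14], 6))
    #   # -> [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14]]
    #
    # i.e. three IndexSubscriptionsChunkForSearch tasks on 'search_indexer'.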

    def index_subscriptions_chunk_for_search(self, feed_ids):
        from apps.rss_feeds.models import Feed
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        user = User.objects.get(pk=self.user_id)

        logging.user(user, "~FCIndexing %s feeds..." % len(feed_ids))

        for feed_id in feed_ids:
            feed = Feed.get_by_id(feed_id)
            if not feed: continue

            feed.index_stories_for_search()

        r.publish(user.username, 'search_index_complete:feeds:%s' %
                  ','.join([str(f) for f in feed_ids]))

    @classmethod
    def schedule_index_feeds_for_search(cls, feed_ids, user_id):
        user_search = cls.get_user(user_id, create=False)
        if (not user_search or
            not user_search.subscriptions_indexed or
            user_search.subscriptions_indexing):
            # User hasn't searched before, or their index is still being built.
            return

        if not isinstance(feed_ids, list):
            feed_ids = [feed_ids]
        IndexFeedsForSearch().apply_async(kwargs=dict(feed_ids=feed_ids, user_id=user_id),
                                          queue='search_indexer')

    @classmethod
    def index_feeds_for_search(cls, feed_ids, user_id):
        from apps.rss_feeds.models import Feed
        user = User.objects.get(pk=user_id)

        logging.user(user, "~SB~FCIndexing %s~FC by request..." % feed_ids)

        for feed_id in feed_ids:
            feed = Feed.get_by_id(feed_id)
            if not feed: continue

            feed.index_stories_for_search()

    @classmethod
    def remove_all(cls, drop_index=False):
        # You only need to drop the index if there is data you want to clear.
        # A new search server won't need this, as there isn't anything to drop.
        if drop_index:
            logging.info(" ---> ~FRRemoving stories search index...")
            SearchStory.drop()

        user_searches = cls.objects.all()
        logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
        for user_search in user_searches:
            try:
                user_search.remove()
            except Exception as e:
                print(" ****> Error on search removal: %s" % e)

    def remove(self):
        from apps.rss_feeds.models import Feed
        from apps.reader.models import UserSubscription

        user = User.objects.get(pk=self.user_id)
        subscriptions = UserSubscription.objects.filter(user=self.user_id)
        total = subscriptions.count()
        removed = 0

        for sub in subscriptions:
            try:
                feed = sub.feed
            except Feed.DoesNotExist:
                continue
            if not feed.search_indexed:
                continue
            feed.search_indexed = False
            feed.save()
            removed += 1

        logging.user(user, "~FCRemoved ~SB%s/%s feed's search indexes~SN for ~SB~FB%s~FC~SN." %
                     (removed, total, user.username))
        self.delete()


class SearchStory:

    ES = pyes.ES(settings.ELASTICSEARCH_STORY_HOSTS)
    name = "stories"

    @classmethod
    def index_name(cls):
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls, delete=False):
        if delete:
            cls.ES.indices.delete_index_if_exists("%s-index" % cls.name)
        cls.ES.indices.create_index_if_missing("%s-index" % cls.name)
        # Titles and tags are boosted above content and author for relevance.
        mapping = {
            'title': {
                'boost': 3.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'standard',
            },
            'content': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'simple',
            },
            'tags': {
                'boost': 2.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'standard',
            },
            'author': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'simple',
            },
            'feed_id': {
                'store': 'no',
                'type': 'integer'
            },
            'date': {
                'store': 'no',
                'type': 'date',
            }
        }
        cls.ES.indices.put_mapping("%s-type" % cls.name, {
            'properties': mapping,
            '_source': {'enabled': False},
        }, ["%s-index" % cls.name])

    @classmethod
    def index(cls, story_hash, story_title, story_content, story_tags, story_author,
              story_feed_id, story_date):
        doc = {
            "content": story_content,
            "title": story_title,
            "tags": ', '.join(story_tags),
            "author": story_author,
            "feed_id": story_feed_id,
            "date": story_date,
        }
        try:
            cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, story_hash)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
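
    # A minimal sketch of indexing one story; all values below are
    # hypothetical and only illustrate the expected field types:
    #
    #   SearchStory.index(story_hash="42:deadbe",
    #                     story_title="Show HN: A tiny RSS reader",
    #                     story_content="<p>Full story body...</p>",
    #                     story_tags=["rss", "python"],
    #                     story_author="anonymous",
    #                     story_feed_id=42,
    #                     story_date=datetime.datetime.now())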

    @classmethod
    def remove(cls, story_hash):
        try:
            cls.ES.delete("%s-index" % cls.name, "%s-type" % cls.name, story_hash)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")

    @classmethod
    def drop(cls):
        cls.ES.indices.delete_index_if_exists("%s-index" % cls.name)

    @classmethod
    def query(cls, feed_ids, query, order, offset, limit, strip=False):
        cls.create_elasticsearch_mapping()
        cls.ES.indices.refresh()

        if strip:
            query = re.sub(r'([^\s\w_\-])+', ' ', query)  # Strip non-alphanumeric characters
        sort = "date:desc" if order == "newest" else "date:asc"
        string_q = pyes.query.QueryStringQuery(query, default_operator="AND")
        # Cap the feed filter at 1,000 feeds to keep the terms query bounded.
        feed_q = pyes.query.TermsQuery('feed_id', feed_ids[:1000])
        q = pyes.query.BoolQuery(must=[string_q, feed_q])
        try:
            results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()],
                                    partial_fields={}, sort=sort, start=offset, size=limit)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        logging.info(" ---> ~FG~SNSearch ~FCstories~FG for: ~SB%s~SN (across %s feed%s)" %
                     (query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))

        try:
            result_ids = [r.get_id() for r in results]
        except pyes.InvalidQuery as e:
            logging.info(" ---> ~FRInvalid search query \"%s\": %s" % (query, e))
            return []

        return result_ids
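
    # A minimal usage sketch (feed ids and query text are hypothetical).
    # Only matching story hashes come back, since _source is disabled in
    # the mapping above:
    #
    #   story_hashes = SearchStory.query(feed_ids=[1, 2, 3], query="elasticsearch",
    #                                    order="newest", offset=0, limit=10)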

    @classmethod
    def global_query(cls, query, order, offset, limit, strip=False):
        cls.create_elasticsearch_mapping()
        cls.ES.indices.refresh()

        if strip:
            query = re.sub(r'([^\s\w_\-])+', ' ', query)  # Strip non-alphanumeric characters
        sort = "date:desc" if order == "newest" else "date:asc"
        string_q = pyes.query.QueryStringQuery(query, default_operator="AND")
        try:
            results = cls.ES.search(string_q, indices=cls.index_name(), doc_types=[cls.type_name()],
                                    partial_fields={}, sort=sort, start=offset, size=limit)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        logging.info(" ---> ~FG~SNSearch ~FCstories~FG for: ~SB%s~SN (across all feeds)" %
                     query)

        try:
            result_ids = [r.get_id() for r in results]
        except pyes.InvalidQuery as e:
            logging.info(" ---> ~FRInvalid search query \"%s\": %s" % (query, e))
            return []

        return result_ids


class SearchFeed:

    _es_client = None
    name = "feeds"

    @classmethod
    def ES(cls):
        # Lazily connect, and make sure the index exists before first use.
        if cls._es_client is None:
            cls._es_client = pyes.ES(settings.ELASTICSEARCH_FEED_HOSTS)
            if not cls._es_client.indices.exists_index(cls.index_name()):
                cls.create_elasticsearch_mapping()
        return cls._es_client

    @classmethod
    def index_name(cls):
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        return "%s-type" % cls.name

    @classmethod
    def create_elasticsearch_mapping(cls, delete=False):
        if delete:
            cls.ES().indices.delete_index_if_exists(cls.index_name())

        # Named index_settings to avoid shadowing django.conf.settings.
        index_settings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "edgengram_analyzer": {
                            "filter": ["edgengram"],
                            "tokenizer": "lowercase",
                            "type": "custom"
                        },
                    },
                    "filter": {
                        "edgengram": {
                            "max_gram": "15",
                            "min_gram": "1",
                            "type": "edgeNGram"
                        },
                    }
                }
            }
        }
        cls.ES().indices.create_index_if_missing(cls.index_name(), index_settings)

        mapping = {
            "address": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "feed_id": {
                "store": True,
                "type": "string"
            },
            "num_subscribers": {
                "index": "analyzed",
                "store": True,
                "type": "long"
            },
            "title": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "link": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            }
        }
        cls.ES().indices.put_mapping(cls.type_name(), {
            'properties': mapping,
        }, [cls.index_name()])
        cls.ES().indices.flush()
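
    # For illustration: with a lowercase tokenizer feeding a 1..15 edgeNGram
    # filter, a title like "Daring Fireball" is indexed as the prefixes
    # "d", "da", "dar", up to "daring", and likewise "f" through "fireball",
    # which is what lets feed search match as the user types.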

    @classmethod
    def index(cls, feed_id, title, address, link, num_subscribers):
        doc = {
            "feed_id": feed_id,
            "title": title,
            "address": address,
            "link": link,
            "num_subscribers": num_subscribers,
        }
        try:
            cls.ES().index(doc, cls.index_name(), cls.type_name(), feed_id)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")

    @classmethod
    def query(cls, text, max_subscribers=5):
        try:
            cls.ES().default_indices = cls.index_name()
            cls.ES().indices.refresh()
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        if settings.DEBUG:
            max_subscribers = 1

        logging.info("~FGSearch ~FCfeeds~FG: ~SB%s" % text)
        # Match against any of address, link, or title, ranked by subscriber count.
        q = pyes.query.BoolQuery()
        q.add_should(pyes.query.MatchQuery('address', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q.add_should(pyes.query.MatchQuery('link', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q.add_should(pyes.query.MatchQuery('title', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q = pyes.Search(q, min_score=1)
        results = cls.ES().search(query=q, size=max_subscribers,
                                  doc_types=[cls.type_name()], sort="num_subscribers:desc")

        return results
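
    # A minimal usage sketch (the query text is hypothetical): returns up to
    # max_subscribers matching feeds, most-subscribed first:
    #
    #   matches = SearchFeed.query("daring fireball", max_subscribers=5)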

    @classmethod
    def export_csv(cls):
        import djqscsv
        from apps.rss_feeds.models import Feed

        qs = Feed.objects.filter(num_subscribers__gte=20).values('id', 'feed_title',
                                                                 'feed_address', 'feed_link',
                                                                 'num_subscribers')
        csv = djqscsv.render_to_csv_response(qs).content
        f = open('feeds.csv', 'w+')
        f.write(csv)
        f.close()