# NewsBlur-viq/apps/search/models.py

import re
import time
import datetime
import pymongo
import pyes
import redis
import celery
import mongoengine as mongo
from django.conf import settings
from django.contrib.auth.models import User
from apps.search.tasks import (IndexSubscriptionsForSearch,
                               IndexSubscriptionsChunkForSearch,
                               IndexFeedsForSearch)
from utils import log as logging
from utils.feed_functions import chunks


class MUserSearch(mongo.Document):
    '''Search index state of a user's subscriptions.'''
    user_id = mongo.IntField(unique=True)
    last_search_date = mongo.DateTimeField()
    subscriptions_indexed = mongo.BooleanField()
    subscriptions_indexing = mongo.BooleanField()

    meta = {
        'collection': 'user_search',
        'indexes': ['user_id'],
        'allow_inheritance': False,
    }
    @classmethod
    def get_user(cls, user_id, create=True):
        try:
            user_search = cls.objects.read_preference(pymongo.ReadPreference.PRIMARY)\
                                     .get(user_id=user_id)
        except cls.DoesNotExist:
            if create:
                user_search = cls.objects.create(user_id=user_id)
            else:
                user_search = None

        return user_search
    def touch_search_date(self):
        if not self.subscriptions_indexed and not self.subscriptions_indexing:
            self.schedule_index_subscriptions_for_search()
            self.subscriptions_indexing = True

        self.last_search_date = datetime.datetime.now()
        self.save()

    def schedule_index_subscriptions_for_search(self):
        IndexSubscriptionsForSearch().apply_async(kwargs=dict(user_id=self.user_id),
                                                  queue='search_indexer_tasker')
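    # A sketch of the entry point, assuming a user with pk 42 exists and
    # celery/Redis are configured. The first search lazily schedules indexing:
    #
    #     user_search = MUserSearch.get_user(42)
    #     user_search.touch_search_date()  # queues IndexSubscriptionsForSearch once
    #
    # Later searches only refresh last_search_date, because
    # subscriptions_indexing stays True until the background task finishes.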
    # Should be run as a background task
    def index_subscriptions_for_search(self):
        from apps.rss_feeds.models import Feed
        from apps.reader.models import UserSubscription

        SearchStory.create_elasticsearch_mapping()

        start = time.time()
        user = User.objects.get(pk=self.user_id)
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'search_index_complete:start')

        subscriptions = UserSubscription.objects.filter(user=user).only('feed')
        total = subscriptions.count()

        feed_ids = []
        for sub in subscriptions:
            try:
                feed_ids.append(sub.feed.pk)
            except Feed.DoesNotExist:
                continue

        feed_id_chunks = [c for c in chunks(feed_ids, 6)]
        logging.user(user, "~FCIndexing ~SB%s feeds~SN in %s chunks..." %
                     (total, len(feed_id_chunks)))

        tasks = [IndexSubscriptionsChunkForSearch.s(feed_ids=feed_id_chunk,
                                                    user_id=self.user_id
                                                    ).set(queue='search_indexer')
                 for feed_id_chunk in feed_id_chunks]
        group = celery.group(*tasks)
        res = group.apply_async(queue='search_indexer')
        res.join_native()

        duration = time.time() - start
        logging.user(user, "~FCIndexed ~SB%s feeds~SN in ~FM~SB%s~FC~SN sec." %
                     (total, round(duration, 2)))
        r.publish(user.username, 'search_index_complete:done')

        self.subscriptions_indexed = True
        self.subscriptions_indexing = False
        self.save()
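    # How the fan-out above behaves, assuming chunks() yields fixed-size
    # slices: 240 subscribed feeds become 40 six-feed chunks, each chunk one
    # IndexSubscriptionsChunkForSearch task on the 'search_indexer' queue.
    # celery.group runs them in parallel, and join_native() blocks until every
    # chunk reports back, so the 'done' pubsub message only fires once the
    # whole subscription list is searchable.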
    def index_subscriptions_chunk_for_search(self, feed_ids):
        from apps.rss_feeds.models import Feed

        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        user = User.objects.get(pk=self.user_id)
        logging.user(user, "~FCIndexing %s feeds..." % len(feed_ids))

        for feed_id in feed_ids:
            feed = Feed.get_by_id(feed_id)
            if not feed: continue
            feed.index_stories_for_search()

        r.publish(user.username, 'search_index_complete:feeds:%s' %
                  ','.join([str(f) for f in feed_ids]))
    @classmethod
    def schedule_index_feeds_for_search(cls, feed_ids, user_id):
        user_search = cls.get_user(user_id, create=False)
        if (not user_search or
            not user_search.subscriptions_indexed or
            user_search.subscriptions_indexing):
            # User hasn't searched before, or indexing is already underway.
            return

        if not isinstance(feed_ids, list):
            feed_ids = [feed_ids]
        IndexFeedsForSearch().apply_async(kwargs=dict(feed_ids=feed_ids, user_id=user_id),
                                          queue='search_indexer')
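    # For a user who has never searched, get_user(..., create=False) returns
    # None and this call is a no-op, so per-feed re-indexing only happens for
    # users who actually use search. A sketch with hypothetical ids:
    #
    #     MUserSearch.schedule_index_feeds_for_search([42, 43], user_id=99)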
    @classmethod
    def index_feeds_for_search(cls, feed_ids, user_id):
        from apps.rss_feeds.models import Feed

        user = User.objects.get(pk=user_id)
        logging.user(user, "~SB~FCIndexing %s~FC by request..." % feed_ids)

        for feed_id in feed_ids:
            feed = Feed.get_by_id(feed_id)
            if not feed: continue
            feed.index_stories_for_search()
    @classmethod
    def remove_all(cls, drop_index=False):
        # You only need to drop the index if there is data you want to clear.
        # A new search server won't need this, as there isn't anything to drop.
        if drop_index:
            logging.info(" ---> ~FRRemoving stories search index...")
            SearchStory.drop()

        user_searches = cls.objects.all()
        logging.info(" ---> ~SN~FRRemoving ~SB%s~SN user searches..." % user_searches.count())
        for user_search in user_searches:
            try:
                user_search.remove()
            except Exception as e:
                print(" ****> Error on search removal: %s" % e)
    def remove(self):
        from apps.rss_feeds.models import Feed
        from apps.reader.models import UserSubscription

        user = User.objects.get(pk=self.user_id)
        subscriptions = UserSubscription.objects.filter(user=self.user_id)
        total = subscriptions.count()
        removed = 0

        for sub in subscriptions:
            try:
                feed = sub.feed
            except Feed.DoesNotExist:
                continue
            if not feed.search_indexed:
                continue
            feed.search_indexed = False
            feed.save()
            removed += 1

        logging.user(user, "~FCRemoved ~SB%s/%s feeds' search indexes~SN for ~SB~FB%s~FC~SN." %
                     (removed, total, user.username))
        self.delete()

class SearchStory:

    ES = pyes.ES(settings.ELASTICSEARCH_STORY_HOSTS)
    name = "stories"

    @classmethod
    def index_name(cls):
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        return "%s-type" % cls.name
    @classmethod
    def create_elasticsearch_mapping(cls, delete=False):
        if delete:
            cls.ES.indices.delete_index_if_exists(cls.index_name())
        cls.ES.indices.create_index_if_missing(cls.index_name())

        mapping = {
            'title': {
                'boost': 3.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'standard',
            },
            'content': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'simple',
            },
            'tags': {
                'boost': 2.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'standard',
            },
            'author': {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'no',
                'type': 'string',
                'analyzer': 'simple',
            },
            'feed_id': {
                'store': 'no',
                'type': 'integer',
            },
            'date': {
                'store': 'no',
                'type': 'date',
            },
        }
        cls.ES.indices.put_mapping(cls.type_name(), {
            'properties': mapping,
            '_source': {'enabled': False},
        }, [cls.index_name()])
    @classmethod
    def index(cls, story_hash, story_title, story_content, story_tags, story_author, story_feed_id,
              story_date):
        doc = {
            "content": story_content,
            "title": story_title,
            "tags": ', '.join(story_tags),
            "author": story_author,
            "feed_id": story_feed_id,
            "date": story_date,
        }
        try:
            cls.ES.index(doc, cls.index_name(), cls.type_name(), story_hash)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
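    # A sketch of an index call, with made-up values. The story_hash doubles
    # as the Elasticsearch document id, so re-indexing the same story
    # overwrites its previous entry instead of duplicating it:
    #
    #     SearchStory.index(story_hash='42:deadbe', story_title='Hello',
    #                       story_content='<p>World</p>', story_tags=['news'],
    #                       story_author='Sam', story_feed_id=42,
    #                       story_date=datetime.datetime.now())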
    @classmethod
    def remove(cls, story_hash):
        try:
            cls.ES.delete(cls.index_name(), cls.type_name(), story_hash)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")

    @classmethod
    def drop(cls):
        cls.ES.indices.delete_index_if_exists(cls.index_name())
    @classmethod
    def query(cls, feed_ids, query, order, offset, limit, strip=False):
        cls.create_elasticsearch_mapping()
        cls.ES.indices.refresh()

        if strip:
            query = re.sub(r'([^\s\w_\-])+', ' ', query)  # Strip punctuation, keep word chars/hyphens
        sort = "date:desc" if order == "newest" else "date:asc"
        string_q = pyes.query.QueryStringQuery(query, default_operator="AND")
        feed_q = pyes.query.TermsQuery('feed_id', feed_ids[:1000])
        q = pyes.query.BoolQuery(must=[string_q, feed_q])
        try:
            results = cls.ES.search(q, indices=cls.index_name(), doc_types=[cls.type_name()],
                                    partial_fields={}, sort=sort, start=offset, size=limit)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        logging.info(" ---> ~FG~SNSearch ~FCstories~FG for: ~SB%s~SN (across %s feed%s)" %
                     (query, len(feed_ids), 's' if len(feed_ids) != 1 else ''))

        try:
            result_ids = [r.get_id() for r in results]
        except pyes.InvalidQuery as e:
            logging.info(" ---> ~FRInvalid search query \"%s\": %s" % (query, e))
            return []

        return result_ids
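    # How a search resolves, with hypothetical ids: every query term must
    # match (default_operator="AND"), and a story must also belong to one of
    # the first 1,000 feed_ids passed in. Since _source is disabled, only
    # story ids come back for the caller to look up:
    #
    #     hashes = SearchStory.query(feed_ids=[1, 2, 3], query='space launch',
    #                                order='newest', offset=0, limit=10)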
    @classmethod
    def global_query(cls, query, order, offset, limit, strip=False):
        cls.create_elasticsearch_mapping()
        cls.ES.indices.refresh()

        if strip:
            query = re.sub(r'([^\s\w_\-])+', ' ', query)  # Strip punctuation, keep word chars/hyphens
        sort = "date:desc" if order == "newest" else "date:asc"
        string_q = pyes.query.QueryStringQuery(query, default_operator="AND")
        try:
            results = cls.ES.search(string_q, indices=cls.index_name(), doc_types=[cls.type_name()],
                                    partial_fields={}, sort=sort, start=offset, size=limit)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        logging.info(" ---> ~FG~SNSearch ~FCstories~FG for: ~SB%s~SN (across all feeds)" %
                     (query))

        try:
            result_ids = [r.get_id() for r in results]
        except pyes.InvalidQuery as e:
            logging.info(" ---> ~FRInvalid search query \"%s\": %s" % (query, e))
            return []

        return result_ids

class SearchFeed:

    _es_client = None
    name = "feeds"

    @classmethod
    def ES(cls):
        # Lazily create the client so the index mapping is only built on first use.
        if cls._es_client is None:
            cls._es_client = pyes.ES(settings.ELASTICSEARCH_FEED_HOSTS)
            if not cls._es_client.indices.exists_index(cls.index_name()):
                cls.create_elasticsearch_mapping()
        return cls._es_client

    @classmethod
    def index_name(cls):
        return "%s-index" % cls.name

    @classmethod
    def type_name(cls):
        return "%s-type" % cls.name
    @classmethod
    def create_elasticsearch_mapping(cls, delete=False):
        if delete:
            cls.ES().indices.delete_index_if_exists(cls.index_name())

        # Named index_settings to avoid shadowing django.conf.settings above.
        index_settings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "edgengram_analyzer": {
                            "filter": ["edgengram"],
                            "tokenizer": "lowercase",
                            "type": "custom"
                        },
                    },
                    "filter": {
                        "edgengram": {
                            "max_gram": "15",
                            "min_gram": "1",
                            "type": "edgeNGram"
                        },
                    }
                }
            }
        }
        cls.ES().indices.create_index_if_missing(cls.index_name(), index_settings)

        mapping = {
            "address": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "feed_id": {
                "store": True,
                "type": "string"
            },
            "num_subscribers": {
                "index": "analyzed",
                "store": True,
                "type": "long"
            },
            "title": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            },
            "link": {
                "analyzer": "edgengram_analyzer",
                "store": False,
                "term_vector": "with_positions_offsets",
                "type": "string"
            }
        }
        cls.ES().indices.put_mapping(cls.type_name(), {
            'properties': mapping,
        }, [cls.index_name()])
        cls.ES().indices.flush()
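    # The edgengram filter indexes every prefix of each lowercased token,
    # from 1 to 15 characters: "daring" is stored as d, da, dar, dari,
    # darin, daring. That lets a partially typed query match feed titles,
    # addresses, and links without wildcard queries at search time.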
    @classmethod
    def index(cls, feed_id, title, address, link, num_subscribers):
        doc = {
            "feed_id": feed_id,
            "title": title,
            "address": address,
            "link": link,
            "num_subscribers": num_subscribers,
        }
        try:
            cls.ES().index(doc, cls.index_name(), cls.type_name(), feed_id)
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
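    # A sketch with made-up values. The feed_id is the document id, so each
    # feed is indexed at most once and updates in place:
    #
    #     SearchFeed.index(feed_id=42, title='Example Feed',
    #                      address='http://example.com/rss',
    #                      link='http://example.com/', num_subscribers=1000)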
    @classmethod
    def query(cls, text, max_subscribers=5):
        try:
            cls.ES().default_indices = cls.index_name()
            cls.ES().indices.refresh()
        except pyes.exceptions.NoServerAvailable:
            logging.debug(" ***> ~FRNo search server available.")
            return []

        if settings.DEBUG:
            max_subscribers = 1

        logging.info("~FGSearch ~FCfeeds~FG: ~SB%s" % text)
        q = pyes.query.BoolQuery()
        q.add_should(pyes.query.MatchQuery('address', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q.add_should(pyes.query.MatchQuery('link', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q.add_should(pyes.query.MatchQuery('title', text, analyzer="simple",
                                           cutoff_frequency=0.0005, minimum_should_match="75%"))
        q = pyes.Search(q, min_score=1)
        results = cls.ES().search(query=q, size=max_subscribers, doc_types=[cls.type_name()],
                                  sort="num_subscribers:desc")
        return results
    @classmethod
    def export_csv(cls):
        import djqscsv
        from apps.rss_feeds.models import Feed

        qs = Feed.objects.filter(num_subscribers__gte=20).values('id', 'feed_title', 'feed_address',
                                                                 'feed_link', 'num_subscribers')
        csv = djqscsv.render_to_csv_response(qs).content
        # The response content is bytes under Python 3, so write in binary mode.
        with open('feeds.csv', 'wb') as f:
            f.write(csv)