NewsBlur-viq/apps/analyzer/models.py

328 lines
11 KiB
Python
Raw Normal View History

2017-01-05 18:26:50 -08:00
import datetime
from collections import defaultdict
2024-04-24 09:50:42 -04:00
import mongoengine as mongo
from django.conf import settings
2009-06-16 03:08:55 +00:00
from django.contrib.auth.models import User
2017-01-05 18:26:50 -08:00
from django.core.mail import EmailMultiAlternatives
2024-04-24 09:50:42 -04:00
from django.db import models
from django.template.loader import render_to_string
2017-01-05 18:26:50 -08:00
from apps.analyzer.tasks import EmailPopularityQuery
2024-04-24 09:50:42 -04:00
from apps.rss_feeds.models import Feed
2017-01-05 18:26:50 -08:00
from utils import log as logging
2024-04-24 09:43:56 -04:00
class FeatureCategory(models.Model):
    """Row tying a user and a feed to a (feature, category) pair with an integer count."""

    user = models.ForeignKey(User, on_delete=models.CASCADE)
    feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
    feature = models.CharField(max_length=255)
    category = models.CharField(max_length=255)
    count = models.IntegerField(default=0)

    def __str__(self):
        """Render as ``<feature> - <category> (<count>)``."""
        fields = (self.feature, self.category, self.count)
        return "%s - %s (%s)" % fields
class Category(models.Model):
    """Row tying a user and a feed to a category name with an integer count."""

    user = models.ForeignKey(User, on_delete=models.CASCADE)
    feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
    category = models.CharField(max_length=255)
    count = models.IntegerField(default=0)

    def __str__(self):
        """Render as ``<category> (<count>)``."""
        fields = (self.category, self.count)
        return "%s (%s)" % fields
2017-01-05 18:26:50 -08:00
class MPopularityQuery(mongo.Document):
    """A keyword-popularity query whose spreadsheet result is emailed to ``email``.

    Documents live in the ``popularity_query`` collection. ``is_emailed`` is
    set once :meth:`send_email` has delivered the spreadsheet.
    """

    email = mongo.StringField()
    query = mongo.StringField()
    is_emailed = mongo.BooleanField()
    creation_date = mongo.DateTimeField(default=datetime.datetime.now)

    meta = {
        "collection": "popularity_query",
        "allow_inheritance": False,
    }

    def __str__(self):
        """Render as ``<email> - "<query>"``."""
        return '%s - "%s"' % (self.email, self.query)

    def queue_email(self):
        """Dispatch the ``EmailPopularityQuery`` task for this document's primary key."""
        EmailPopularityQuery.delay(pk=str(self.pk))

    @classmethod
    def ensure_all_sent(cls, queue=True):
        """Call :meth:`ensure_sent` on every stored query, oldest ``creation_date`` first.

        Args:
            queue: Forwarded to :meth:`ensure_sent`.
        """
        for query in cls.objects.all().order_by("creation_date"):
            query.ensure_sent(queue=queue)

    def ensure_sent(self, queue=True):
        """Send (or queue) the email unless it has already been sent.

        Args:
            queue: When true, hand off via :meth:`queue_email`; otherwise
                call :meth:`send_email` directly.
        """
        if self.is_emailed:
            logging.debug(" ---> Already sent %s" % self)
            return

        if queue:
            self.queue_email()
        else:
            self.send_email()

    def send_email(self, limit=5000):
        """Build the popularity spreadsheet, email it, and mark this query as emailed.

        Args:
            limit: Forwarded to ``Feed.xls_query_popularity``.
        """
        filename = Feed.xls_query_popularity(self.query, limit=limit)
        # An .xlsx file is a binary container: read it as bytes (text mode
        # would try to decode it) and close the handle as soon as we have it.
        with open(filename, "rb") as xlsx:
            spreadsheet = xlsx.read()

        params = {"query": self.query}
        text = render_to_string("mail/email_popularity_query.txt", params)
        html = render_to_string("mail/email_popularity_query.xhtml", params)
        subject = 'Keyword popularity spreadsheet: "%s"' % self.query
        msg = EmailMultiAlternatives(
            subject, text, from_email="NewsBlur <%s>" % settings.HELLO_EMAIL, to=["<%s>" % (self.email)]
        )
        msg.attach_alternative(html, "text/html")
        msg.attach(filename, spreadsheet, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
        msg.send()

        # Only flag as emailed after send() returned without raising.
        self.is_emailed = True
        self.save()

        logging.debug(" -> ~BB~FM~SBSent email for popularity query: %s" % self)
2024-04-24 09:43:56 -04:00
2017-01-05 18:26:50 -08:00
class MClassifierTitle(mongo.Document):
    """A user's score for a title phrase, stored in the ``classifier_title`` collection."""

    user_id = mongo.IntField()
    feed_id = mongo.IntField()
    social_user_id = mongo.IntField()
    title = mongo.StringField(max_length=255)
    score = mongo.IntField()
    creation_date = mongo.DateTimeField()

    meta = {
        "collection": "classifier_title",
        "indexes": [
            ("user_id", "feed_id"),
            "feed_id",
            ("user_id", "social_user_id"),
            "social_user_id",
        ],
        "allow_inheritance": False,
    }

    def __str__(self):
        """Render owner, feed/social ids, score and the first 30 characters of the title.

        Looks the owning ``User`` up by ``user_id`` on every call.
        """
        owner = User.objects.get(pk=self.user_id)
        preview = self.title[:30]
        return "%s - %s/%s: (%s) %s" % (owner, self.feed_id, self.social_user_id, self.score, preview)
2024-04-24 09:43:56 -04:00
class MClassifierAuthor(mongo.Document):
    """A user's score for an author, stored in the ``classifier_author`` collection.

    ``user_id`` is declared unique together with ``feed_id``,
    ``social_user_id`` and ``author``.
    """

    user_id = mongo.IntField(unique_with=("feed_id", "social_user_id", "author"))
    feed_id = mongo.IntField()
    social_user_id = mongo.IntField()
    author = mongo.StringField(max_length=255)
    score = mongo.IntField()
    creation_date = mongo.DateTimeField()

    meta = {
        "collection": "classifier_author",
        "indexes": [
            ("user_id", "feed_id"),
            "feed_id",
            ("user_id", "social_user_id"),
            "social_user_id",
        ],
        "allow_inheritance": False,
    }

    def __str__(self):
        """Render owner, feed/social ids, score and the first 30 characters of the author.

        Looks the owning ``User`` up by ``user_id`` on every call.
        """
        owner = User.objects.get(pk=self.user_id)
        preview = self.author[:30]
        return "%s - %s/%s: (%s) %s" % (owner, self.feed_id, self.social_user_id, self.score, preview)
2024-04-24 09:43:56 -04:00
class MClassifierTag(mongo.Document):
    """A user's score for a tag, stored in the ``classifier_tag`` collection.

    ``user_id`` is declared unique together with ``feed_id``,
    ``social_user_id`` and ``tag``.
    """

    user_id = mongo.IntField(unique_with=("feed_id", "social_user_id", "tag"))
    feed_id = mongo.IntField()
    social_user_id = mongo.IntField()
    tag = mongo.StringField(max_length=255)
    score = mongo.IntField()
    creation_date = mongo.DateTimeField()

    meta = {
        "collection": "classifier_tag",
        "indexes": [
            ("user_id", "feed_id"),
            "feed_id",
            ("user_id", "social_user_id"),
            "social_user_id",
        ],
        "allow_inheritance": False,
    }

    def __str__(self):
        """Render owner, feed/social ids, score and the first 30 characters of the tag.

        Looks the owning ``User`` up by ``user_id`` on every call.
        """
        owner = User.objects.get(pk=self.user_id)
        preview = self.tag[:30]
        return "%s - %s/%s: (%s) %s" % (owner, self.feed_id, self.social_user_id, self.score, preview)
2024-04-24 09:43:56 -04:00
class MClassifierFeed(mongo.Document):
    """A user's score for a whole feed or social user, stored in ``classifier_feed``.

    ``user_id`` is declared unique together with ``feed_id`` and
    ``social_user_id``.
    """

    user_id = mongo.IntField(unique_with=("feed_id", "social_user_id"))
    feed_id = mongo.IntField()
    social_user_id = mongo.IntField()
    score = mongo.IntField()
    creation_date = mongo.DateTimeField()

    meta = {
        "collection": "classifier_feed",
        "indexes": [
            ("user_id", "feed_id"),
            "feed_id",
            ("user_id", "social_user_id"),
            "social_user_id",
        ],
        "allow_inheritance": False,
    }

    def __str__(self):
        """Render owner, feed/social ids, score and the classified target.

        The target is the ``Feed`` when ``feed_id`` is truthy, otherwise the
        ``User`` identified by ``social_user_id``.
        """
        owner = User.objects.get(pk=self.user_id)
        target = Feed.get_by_id(self.feed_id) if self.feed_id else User.objects.get(pk=self.social_user_id)
        return "%s - %s/%s: (%s) %s" % (owner, self.feed_id, self.social_user_id, self.score, target)
2024-04-24 09:43:56 -04:00
def compute_story_score(story, classifier_titles, classifier_authors, classifier_tags, classifier_feeds):
    """Combine the four classifier scores for ``story`` into a single score.

    A positive title/author/tag score wins first, then a negative one; the
    feed score is used only when none of those three is non-zero.

    Args:
        story: Mapping with at least ``story_feed_id``; passed on to the
            title, author and tag scorers.
        classifier_titles: Title classifiers to apply.
        classifier_authors: Author classifiers to apply.
        classifier_tags: Tag classifiers to apply.
        classifier_feeds: Feed classifiers to apply.

    Returns:
        The resulting score.
    """
    feed_score = apply_classifier_feeds(classifier_feeds, story["story_feed_id"])
    author_score = apply_classifier_authors(classifier_authors, story)
    tag_score = apply_classifier_tags(classifier_tags, story)
    title_score = apply_classifier_titles(classifier_titles, story)

    content_scores = (title_score, author_score, tag_score)
    highest = max(content_scores)
    if highest > 0:
        return highest

    lowest = min(content_scores)
    if lowest < 0:
        return lowest

    # Nothing about the story content was classified: fall back to the feed.
    return feed_score
2024-04-24 09:43:56 -04:00
def apply_classifier_titles(classifiers, story):
    """Score ``story`` against title classifiers for the story's feed.

    A classifier matches when its title is a case-insensitive substring of
    the story title. The first positive match is returned immediately;
    otherwise the score of the last match is returned.

    Args:
        classifiers: Iterable of objects with ``feed_id``, ``title`` and
            ``score`` attributes.
        story: Mapping with ``story_feed_id`` and, optionally,
            ``story_title``. A missing or ``None`` title is treated as an
            empty string instead of raising.

    Returns:
        The matching score, or 0 when nothing matched.
    """
    # Lower-case the story title once rather than once per classifier.
    story_title = (story.get("story_title") or "").lower()

    score = 0
    for classifier in classifiers:
        if classifier.feed_id != story["story_feed_id"]:
            continue

        if classifier.title.lower() in story_title:
            score = classifier.score
            if score > 0:
                return score
    return score
2024-04-24 09:43:56 -04:00
def apply_classifier_authors(classifiers, story):
    """Score ``story`` against author classifiers for the story's feed.

    A classifier matches when the story has a truthy ``story_authors`` value
    equal to the classifier's author. The first positive match ends the
    search; otherwise the score of the last match is returned.

    Args:
        classifiers: Iterable of objects with ``feed_id``, ``author`` and
            ``score`` attributes.
        story: Mapping with ``story_feed_id`` and, optionally,
            ``story_authors``.

    Returns:
        The matching score, or 0 when nothing matched.
    """
    result = 0
    for candidate in classifiers:
        if candidate.feed_id != story["story_feed_id"]:
            continue

        story_authors = story.get("story_authors")
        if not story_authors or candidate.author != story_authors:
            continue

        result = candidate.score
        if result > 0:
            break
    return result
2024-04-24 09:43:56 -04:00
def apply_classifier_tags(classifiers, story):
    """Score ``story`` against tag classifiers for the story's feed.

    A classifier matches when its tag is contained in the story's
    ``story_tags``. The first positive match is returned immediately;
    otherwise the score of the last match is returned.

    Args:
        classifiers: Iterable of objects with ``feed_id``, ``tag`` and
            ``score`` attributes.
        story: Mapping with ``story_feed_id`` and, optionally,
            ``story_tags``. A missing, ``None`` or empty value means the
            story has no tags.

    Returns:
        The matching score, or 0 when nothing matched.
    """
    # Use .get so a story without the key scores 0 instead of raising,
    # matching how the author scorer reads its optional field.
    story_tags = story.get("story_tags")
    if not story_tags:
        return 0

    score = 0
    for classifier in classifiers:
        if classifier.feed_id != story["story_feed_id"]:
            continue

        if classifier.tag in story_tags:
            score = classifier.score
            if score > 0:
                return score
    return score
2024-04-24 09:43:56 -04:00
def apply_classifier_feeds(classifiers, feed, social_user_ids=None):
    """Return the score of the first feed classifier matching ``feed`` or a social user.

    Args:
        classifiers: Iterable of objects with ``feed_id``, ``social_user_id``
            and ``score`` attributes.
        feed: A feed id (``int``), an object exposing ``pk``, or a falsy
            value when only social users should be matched.
        social_user_ids: A single social user id or a list/tuple/set of
            them; optional.

    Returns:
        The first matching classifier's score, or 0 when nothing matches or
        neither ``feed`` nor ``social_user_ids`` is given.
    """
    if not feed and not social_user_ids:
        return 0

    feed_id = None
    if feed:
        feed_id = feed if isinstance(feed, int) else feed.pk

    # Wrap a lone id; leave real collections alone so tuples and sets can be
    # searched directly instead of being nested inside a one-element list.
    if social_user_ids and not isinstance(social_user_ids, (list, tuple, set, frozenset)):
        social_user_ids = [social_user_ids]

    for classifier in classifiers:
        # Without a feed there is nothing to match on feed_id; the explicit
        # None check stops a classifier lacking a feed_id from matching via
        # ``None == None``.
        if feed_id is not None and classifier.feed_id == feed_id:
            return classifier.score

        if social_user_ids and not classifier.feed_id and classifier.social_user_id in social_user_ids:
            return classifier.score
    return 0
2024-04-24 09:43:56 -04:00
def get_classifiers_for_user(
    user,
    feed_id=None,
    social_user_id=None,
    classifier_feeds=None,
    classifier_authors=None,
    classifier_titles=None,
    classifier_tags=None,
):
    """Build the classifier payload (feeds, authors, titles, tags) for ``user``.

    Any classifier list left as ``None`` is loaded from its Mongo collection,
    filtered by the user and the given feed / social user.

    Args:
        user: Object exposing ``pk``.
        feed_id: A single feed id, or a list of ids (queried with
            ``feed_id__in``).
        social_user_id: Social user id; a string has its ``"social:"``
            prefix removed and is converted to ``int``.
        classifier_feeds: Pre-loaded feed classifiers, or ``None`` to query.
        classifier_authors: Pre-loaded author classifiers, or ``None``.
        classifier_titles: Pre-loaded title classifiers, or ``None``.
        classifier_tags: Pre-loaded tag classifiers, or ``None``.

    Returns:
        Dict with ``feeds``, ``authors``, ``titles`` and ``tags`` keys, each
        mapping a classified value to its score. Social feed classifiers are
        keyed as ``"social:<id>"``.
    """
    query = {"user_id": user.pk}

    if isinstance(feed_id, list):
        query["feed_id__in"] = feed_id
    elif feed_id:
        query["feed_id"] = feed_id

    if social_user_id:
        if isinstance(social_user_id, str):
            social_user_id = int(social_user_id.replace("social:", ""))
        query["social_user_id"] = social_user_id

    if classifier_authors is None:
        classifier_authors = list(MClassifierAuthor.objects(**query))
    if classifier_titles is None:
        classifier_titles = list(MClassifierTitle.objects(**query))
    if classifier_tags is None:
        classifier_tags = list(MClassifierTag.objects(**query))

    if classifier_feeds is None:
        # Only the feed-classifier query is pinned to social_user_id == 0 for
        # a plain feed lookup; the three queries above ran before this tweak.
        if feed_id and not social_user_id:
            query["social_user_id"] = 0
        classifier_feeds = list(MClassifierFeed.objects(**query))

    feed_scores = {}
    for entry in classifier_feeds:
        is_social = entry.social_user_id and not entry.feed_id
        key = "social:%s" % entry.social_user_id if is_social else entry.feed_id
        feed_scores[key] = entry.score

    return {
        "feeds": feed_scores,
        "authors": {entry.author: entry.score for entry in classifier_authors},
        "titles": {entry.title: entry.score for entry in classifier_titles},
        "tags": {entry.tag: entry.score for entry in classifier_tags},
    }
2024-04-24 09:43:56 -04:00
def sort_classifiers_by_feed(
    user,
    feed_ids=None,
    classifier_feeds=None,
    classifier_authors=None,
    classifier_titles=None,
    classifier_tags=None,
):
    """Build a per-feed classifier payload for each id in ``feed_ids``.

    Each supplied classifier list is grouped by ``feed_id`` and the group for
    every requested feed is handed to ``get_classifiers_for_user``. A list
    left as ``None`` is passed through as ``None`` rather than being
    iterated (which used to raise ``TypeError``).

    Args:
        user: Passed through to ``get_classifiers_for_user``.
        feed_ids: Iterable of feed ids; when falsy, nothing is computed.
        classifier_feeds: Feed classifiers with a ``feed_id`` attribute, or ``None``.
        classifier_authors: Author classifiers, or ``None``.
        classifier_titles: Title classifiers, or ``None``.
        classifier_tags: Tag classifiers, or ``None``.

    Returns:
        Dict mapping each feed id to the payload returned by
        ``get_classifiers_for_user``; empty when ``feed_ids`` is falsy.
    """

    def sort_by_feed(classifiers):
        # Group classifiers by feed_id; None stays None so the caller can
        # tell "not supplied" apart from "supplied but empty".
        if classifiers is None:
            return None
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
            feed_classifiers[classifier.feed_id].append(classifier)
        return feed_classifiers

    def for_feed(grouped, feed_id):
        # Classifiers of one feed, or None when the list was not supplied.
        return None if grouped is None else grouped.get(feed_id, [])

    classifiers = {}
    if feed_ids:
        classifier_feeds = sort_by_feed(classifier_feeds)
        classifier_authors = sort_by_feed(classifier_authors)
        classifier_titles = sort_by_feed(classifier_titles)
        classifier_tags = sort_by_feed(classifier_tags)

        for feed_id in feed_ids:
            classifiers[feed_id] = get_classifiers_for_user(
                user,
                feed_id=feed_id,
                classifier_feeds=for_feed(classifier_feeds, feed_id),
                classifier_authors=for_feed(classifier_authors, feed_id),
                classifier_titles=for_feed(classifier_titles, feed_id),
                classifier_tags=for_feed(classifier_tags, feed_id),
            )

    return classifiers