mirror of
https://github.com/viq/NewsBlur.git
synced 2025-09-18 21:43:31 +00:00
Merge feeds, now for MongoDB. Checks feed titles, taglines, and links, and then guarantees at least 3 stories are the same.
This commit is contained in:
parent
132c561be5
commit
3dd61cbbc6
1 changed files with 19 additions and 124 deletions
|
@ -1,12 +1,7 @@
|
|||
from django.core.management.base import BaseCommand
|
||||
from apps.rss_feeds.models import Feed, Story, DuplicateFeed
|
||||
from apps.reader.models import UserSubscription, UserStory, UserSubscriptionFolders
|
||||
from apps.analyzer.models import FeatureCategory, Category, ClassifierTitle
|
||||
from apps.analyzer.models import ClassifierAuthor, ClassifierFeed, ClassifierTag
|
||||
from apps.rss_feeds.models import merge_feeds, MStory
|
||||
from optparse import make_option
|
||||
from django.db import connection
|
||||
from django.db.utils import IntegrityError
|
||||
from utils import json
|
||||
|
||||
class Command(BaseCommand):
|
||||
option_list = BaseCommand.option_list + (
|
||||
|
@ -18,21 +13,15 @@ class Command(BaseCommand):
|
|||
cursor = connection.cursor()
|
||||
cursor.execute("""SELECT DISTINCT f.id AS original_id, f2.id AS duplicate_id,
|
||||
f.feed_address AS original_feed_address,
|
||||
f2.feed_address AS duplicate_feed_address
|
||||
"""
|
||||
# f.feed_title AS original_feed_title,
|
||||
# f2.feed_title AS duplicate_feed_title,
|
||||
# f.feed_link AS original_feed_link,
|
||||
# f2.feed_link AS duplicate_feed_link,
|
||||
# f2.feed_tagline AS original_feed_tagline,
|
||||
# f.feed_tagline AS duplicate_feed_tagline
|
||||
"""
|
||||
FROM stories s1
|
||||
INNER JOIN stories s2 ON s1.story_guid_hash = s2.story_guid_hash
|
||||
INNER JOIN feeds f ON f.id = s1.story_feed_id
|
||||
INNER JOIN feeds f2 ON f2.id = s2.story_feed_id
|
||||
WHERE s1.story_feed_id != s2.story_feed_id
|
||||
AND f2.id > f.id
|
||||
f2.feed_address AS duplicate_feed_address,
|
||||
f.feed_title AS original_feed_title,
|
||||
f2.feed_title AS duplicate_feed_title,
|
||||
f.feed_link AS original_feed_link,
|
||||
f2.feed_link AS duplicate_feed_link,
|
||||
f2.feed_tagline AS original_feed_tagline,
|
||||
f.feed_tagline AS duplicate_feed_tagline
|
||||
FROM feeds f, feeds f2
|
||||
WHERE f2.id > f.id
|
||||
AND f.feed_tagline = f2.feed_tagline
|
||||
AND f.feed_link = f2.feed_link
|
||||
AND f.feed_title = f2.feed_title
|
||||
|
@ -41,107 +30,13 @@ class Command(BaseCommand):
|
|||
feed_fields = ('original_id', 'duplicate_id', 'original_feed_address', 'duplicate_feed_address')
|
||||
for feeds_values in cursor.fetchall():
|
||||
feeds = dict(zip(feed_fields, feeds_values))
|
||||
try:
|
||||
original_feed = Feed.objects.get(pk=feeds['original_id'])
|
||||
duplicate_feed = Feed.objects.get(pk=feeds['duplicate_id'])
|
||||
except Feed.DoesNotExist:
|
||||
print " ***> Already deleted feed: %s" % feeds['duplicate_id']
|
||||
|
||||
original_stories = MStory.objects(story_feed_id=feeds['original_id']).only('story_guid')[10:12]
|
||||
original_story_ids = [story.id for story in original_stories]
|
||||
if MStory.objects(story_feed_id=feeds['duplicate_id'], story_guid__in=original_story_ids).count() != 3:
|
||||
print "Skipping: %s" % (feeds)
|
||||
continue
|
||||
|
||||
print " ---> Feed: [%s - %s] %s - %s" % (feeds['original_id'], feeds['duplicate_id'],
|
||||
original_feed, original_feed.feed_link)
|
||||
print " --> %s" % feeds['original_feed_address']
|
||||
print " --> %s" % feeds['duplicate_feed_address']
|
||||
|
||||
user_subs = UserSubscription.objects.filter(feed=duplicate_feed)
|
||||
for user_sub in user_subs:
|
||||
# Rewrite feed in subscription folders
|
||||
try:
|
||||
user_sub_folders = UserSubscriptionFolders.objects.get(user=user_sub.user)
|
||||
except Exception, e:
|
||||
print " *** ---> UserSubscriptionFolders error: %s" % e
|
||||
continue
|
||||
folders = json.decode(user_sub_folders.folders)
|
||||
folders = self.rewrite_folders(folders, original_feed, duplicate_feed)
|
||||
user_sub_folders.folders = json.encode(folders)
|
||||
user_sub_folders.save()
|
||||
|
||||
# Switch to original feed for the user subscription
|
||||
print " ===> %s " % user_sub.user
|
||||
user_sub.feed = original_feed
|
||||
user_sub.needs_unread_recalc = True
|
||||
try:
|
||||
user_sub.save()
|
||||
pass
|
||||
except IntegrityError:
|
||||
print " !!!!> %s already subscribed" % user_sub.user
|
||||
user_sub.delete()
|
||||
|
||||
# Switch read stories
|
||||
user_stories = UserStory.objects.filter(feed=duplicate_feed)
|
||||
print " ---> %s read stories" % user_stories.count()
|
||||
for user_story in user_stories:
|
||||
user_story.feed = original_feed
|
||||
duplicate_story = user_story.story
|
||||
original_story = Story.objects.filter(story_guid_hash=duplicate_story.story_guid_hash,
|
||||
story_feed=original_feed)
|
||||
if original_story:
|
||||
user_story.story = original_story[0]
|
||||
else:
|
||||
print " ***> Can't find original story: %s" % duplicate_story
|
||||
try:
|
||||
user_story.save()
|
||||
except IntegrityError:
|
||||
print " ***> Story already saved: %s" % user_story
|
||||
|
||||
def delete_story_feed(model, feed_field='feed'):
|
||||
duplicate_stories = model.objects.filter(**{feed_field: duplicate_feed})
|
||||
# if duplicate_stories.count():
|
||||
# print " ---> Deleting %s %s" % (duplicate_stories.count(), model)
|
||||
duplicate_stories.delete()
|
||||
def switch_feed(model):
|
||||
duplicates = model.objects.filter(feed=duplicate_feed)
|
||||
if duplicates.count():
|
||||
print " ---> Switching %s %s" % (duplicates.count(), model)
|
||||
for duplicate in duplicates:
|
||||
duplicate.feed = original_feed
|
||||
try:
|
||||
duplicate.save()
|
||||
pass
|
||||
except IntegrityError:
|
||||
print " !!!!> %s already exists" % duplicate
|
||||
duplicates.delete()
|
||||
delete_story_feed(Story, 'story_feed')
|
||||
switch_feed(FeatureCategory)
|
||||
switch_feed(Category)
|
||||
switch_feed(ClassifierTitle)
|
||||
switch_feed(ClassifierAuthor)
|
||||
switch_feed(ClassifierFeed)
|
||||
switch_feed(ClassifierTag)
|
||||
|
||||
try:
|
||||
DuplicateFeed.objects.create(
|
||||
duplicate_address=duplicate_feed.feed_address,
|
||||
feed=original_feed
|
||||
)
|
||||
except IntegrityError:
|
||||
pass
|
||||
|
||||
duplicate_feed.delete()
|
||||
|
||||
def rewrite_folders(self, folders, original_feed, duplicate_feed):
|
||||
new_folders = []
|
||||
|
||||
for k, folder in enumerate(folders):
|
||||
if isinstance(folder, int):
|
||||
if folder == duplicate_feed.pk:
|
||||
# print " ===> Rewrote %s'th item: %s" % (k+1, folders)
|
||||
new_folders.append(original_feed.pk)
|
||||
else:
|
||||
new_folders.append(folder)
|
||||
elif isinstance(folder, dict):
|
||||
for f_k, f_v in folder.items():
|
||||
new_folders.append({f_k: self.rewrite_folders(f_v, original_feed, duplicate_feed)})
|
||||
|
||||
return new_folders
|
||||
|
||||
else:
|
||||
print "Merging: %s" % feeds
|
||||
# merge_feeds(feeds['original_id'], feeds['duplicate_id'])
|
||||
|
Loading…
Add table
Reference in a new issue