# NewsBlur/apps/feed_import/models.py


import datetime
from collections import defaultdict

import lxml.etree
from celery.task import Task
from django.contrib.auth.models import User
from django.db import models

from apps.reader.models import UserSubscription, UserSubscriptionFolders
from apps.rss_feeds.models import Feed, DuplicateFeed
from apps.rss_feeds.tasks import NewFeeds
from utils import json_functions as json, urlnorm
from utils import log as logging
import utils.opml as opml

class OAuthToken(models.Model):
    """OAuth request/access tokens captured during an import, tied to
    either a logged-in user or an anonymous session and IP."""

    user = models.OneToOneField(User, null=True, blank=True)
    session_id = models.CharField(max_length=50, null=True, blank=True)
    remote_ip = models.CharField(max_length=50, null=True, blank=True)
    request_token = models.CharField(max_length=50)
    request_token_secret = models.CharField(max_length=50)
    access_token = models.CharField(max_length=50)
    access_token_secret = models.CharField(max_length=50)
    created_date = models.DateTimeField(default=datetime.datetime.now)

class Importer:
    """Base class for feed importers. Subclasses call clear_feeds() to wipe
    the user's existing subscriptions before rebuilding them from the
    imported data."""

    def clear_feeds(self):
        UserSubscriptionFolders.objects.filter(user=self.user).delete()
        UserSubscription.objects.filter(user=self.user).delete()

class OPMLImporter(Importer):
    def __init__(self, opml_xml, user):
        self.user = user
        self.opml_xml = opml_xml

    def process(self):
        outline = opml.from_string(self.opml_xml)
        self.clear_feeds()
        folders = self.process_outline(outline)
        UserSubscriptionFolders.objects.create(user=self.user, folders=json.encode(folders))
        return folders
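
    # Usage sketch (illustrative only; the view wiring is an assumption, not
    # code from this app). An OPML upload handler would read the raw XML and
    # hand it to the importer:
    #
    #     importer = OPMLImporter(request.FILES['file'].read(), request.user)
    #     folders = importer.process()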

    def process_outline(self, outline):
        folders = []
        for item in outline:
            if not hasattr(item, 'xmlUrl'):
                folder = item
                if hasattr(folder, 'text'):
                    logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
                    folders.append({folder.text: self.process_outline(folder)})
            elif hasattr(item, 'xmlUrl'):
                feed = item
                if not hasattr(feed, 'htmlUrl'):
                    setattr(feed, 'htmlUrl', None)
                if not hasattr(feed, 'title'):
                    setattr(feed, 'title', feed.htmlUrl)
                feed_address = urlnorm.normalize(feed.xmlUrl)
                feed_link = urlnorm.normalize(feed.htmlUrl)
                # Skip feeds whose normalized URLs won't fit in the Feed model.
                if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                    continue
                if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                    continue
                if feed.title and len(feed.title) > Feed._meta.get_field('feed_title').max_length:
                    feed.title = feed.title[:255]
                logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
                feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
                # See if the address is a known duplicate before creating a new feed.
                duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feed:
                    feed_db = duplicate_feed[0].feed
                else:
                    feed_data['active_subscribers'] = 1
                    feed_data['num_subscribers'] = 1
                    feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                            defaults=dict(**feed_data))
                us, _ = UserSubscription.objects.get_or_create(
                    feed=feed_db,
                    user=self.user,
                    defaults={
                        'needs_unread_recalc': True,
                        'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                        'active': self.user.profile.is_premium,
                    }
                )
                folders.append(feed_db.pk)
        return folders
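
    # The returned ``folders`` value is a nested list of feed ids and
    # one-key folder dicts, e.g. (ids illustrative):
    #
    #     [1, 4, {'Blogs': [2, 3, {'Tech': [5]}]}]
    #
    # which is what process() JSON-encodes into UserSubscriptionFolders.folders.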

class GoogleReaderImporter(Importer):
    def __init__(self, feeds_xml, user):
        self.user = user
        self.feeds_xml = feeds_xml
        self.subscription_folders = []

    def process(self):
        self.clear_feeds()
        self.parse()

        folders = defaultdict(list)
        for item in self.feeds:
            folders = self.process_item(item, folders)
        self.rearrange_folders(folders)
        logging.info(" ---> [%s] ~BC~FW~SBGoogle Reader import: ~BT~FW%s" % (self.user, self.subscription_folders))
        UserSubscriptionFolders.objects.get_or_create(user=self.user, defaults=dict(
            folders=json.encode(self.subscription_folders)))

    def parse(self):
        self.feeds = lxml.etree.fromstring(self.feeds_xml).xpath('/object/list/object')

    def process_item(self, item, folders):
        feed_title = item.xpath('./string[@name="title"]') and \
                     item.xpath('./string[@name="title"]')[0].text
        feed_address = item.xpath('./string[@name="id"]') and \
                       item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
        feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                    item.xpath('./string[@name="htmlUrl"]')[0].text
        category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
                   item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text

        if not feed_address:
            feed_address = feed_link

        try:
            feed_link = urlnorm.normalize(feed_link)
            feed_address = urlnorm.normalize(feed_address)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                return folders

            # See if the address is a known duplicate before creating a new feed.
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                        defaults=dict(**feed_data))

            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium,
                }
            )
            if not category:
                category = "Root"
            folders[category].append(feed_db.pk)
        except Exception as e:
            logging.info(' *** -> Exception: %s' % e)

        return folders

    def rearrange_folders(self, folders, depth=0):
        for folder, items in folders.items():
            if folder == 'Root':
                self.subscription_folders += items
            else:
                # folder_parents = folder.split(u' \u2014 ')
                self.subscription_folders.append({folder: items})
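
    # rearrange_folders flattens the category dict into NewsBlur's folder
    # structure: feeds filed under 'Root' go at the top level, everything
    # else becomes a one-key dict. E.g. (ids illustrative):
    #
    #     {'Root': [1, 2], 'Tech': [3, 4]}  ->  [1, 2, {'Tech': [3, 4]}]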

def queue_new_feeds(user):
    new_feeds = UserSubscription.objects.filter(user=user,
                                                feed__fetched_once=False,
                                                active=True).values('feed_id')
    new_feeds = list(set([f['feed_id'] for f in new_feeds]))
    logging.info(" ---> [%s] ~BC~FW~SBQueueing NewFeeds: ~FC(%s) %s" % (user, len(new_feeds), new_feeds))

    size = 4
    publisher = Task.get_publisher(exchange="new_feeds")
    for t in (new_feeds[pos:pos + size] for pos in xrange(0, len(new_feeds), size)):
        NewFeeds.apply_async(args=(t,), queue="new_feeds", publisher=publisher)
    publisher.connection.close()
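
# queue_new_feeds fans the unfetched feed ids out in chunks of ``size``. For
# example, 10 new feed ids would be queued as three NewFeeds tasks carrying
# 4, 4, and 2 ids, all published over one shared connection to the
# "new_feeds" exchange before it is closed.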