diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index f6586bbf4..a4e478ca7 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -410,6 +410,8 @@ class Feed(models.Model): return cls.objects.get(feed_address=url) if url and re.match('(https?://)?twitter.com/\w+/?$', url): without_rss = True + if url and re.match(r'(https?://)?(www\.)?facebook.com/\w+/?$', url): + without_rss = True if url and 'youtube.com/user/' in url: username = re.search('youtube.com/user/(\w+)', url).group(1) url = "http://gdata.youtube.com/feeds/base/users/%s/uploads" % username diff --git a/apps/social/models.py b/apps/social/models.py index 72be26611..174744aea 100644 --- a/apps/social/models.py +++ b/apps/social/models.py @@ -2581,10 +2581,10 @@ class MSocialServices(mongo.Document): self.syncing_facebook = False self.save() - facebook_user = graph.request('me', args={'fields':'website,bio,location'}) + facebook_user = graph.request('me', args={'fields':'website,about,location'}) profile = MSocialProfile.get_user(self.user_id) profile.location = profile.location or (facebook_user.get('location') and facebook_user['location']['name']) - profile.bio = profile.bio or facebook_user.get('bio') + profile.bio = profile.bio or facebook_user.get('about') if not profile.website and facebook_user.get('website'): profile.website = facebook_user.get('website').split()[0] profile.save() diff --git a/utils/facebook_fetcher.py b/utils/facebook_fetcher.py index b4c5a4e6a..550ae881d 100644 --- a/utils/facebook_fetcher.py +++ b/utils/facebook_fetcher.py @@ -1,11 +1,9 @@ import re import datetime -import tweepy import dateutil.parser from django.conf import settings from django.utils import feedgenerator from django.utils.html import linebreaks -from django.utils.dateformat import DateFormat from apps.social.models import MSocialServices from apps.reader.models import UserSubscription from utils import log as logging @@ -21,57 +19,77 @@ class FacebookFetcher: address = self.feed.feed_address self.address = address - page = self.extract_page() - if not page: + page_name = self.extract_page_name() + if not page_name: return - facebook_user = self.fetch_user(username) + facebook_user = self.facebook_user() if not facebook_user: return # If 'video', use video API to get embed: # f.get_object('tastyvegetarian', fields='posts') # f.get_object('1992797300790726', fields='embed_html') - stories = self.page_feed(page, facebook_user) + feed = self.fetch_page_feed(facebook_user, page_name, 'name,about,posts,videos,photos') data = {} - data['title'] = "%s on Facebook" % page - data['link'] = "https://facebook.com/%s" % page - data['description'] = "%s on Facebook" % page + data['title'] = feed.get('name', "%s on Facebook" % page_name) + data['link'] = feed.get('link', "https://facebook.com/%s" % page_name) + data['description'] = feed.get('about', "%s on Facebook" % page_name) data['lastBuildDate'] = datetime.datetime.utcnow() data['generator'] = 'NewsBlur Facebook API Decrapifier - %s' % settings.NEWSBLUR_URL data['docs'] = None data['feed_url'] = address rss = feedgenerator.Atom1Feed(**data) + merged_data = [] - for story in stories: - story_data = self.page_feed_story(story.__dict__) + posts = feed.get('posts', {}).get('data', None) + if posts: + for post in posts: + story_data = self.page_posts_story(facebook_user, post) + if not story_data: + continue + merged_data.append(story_data) + + videos = feed.get('videos', {}).get('data', None) + for video in videos: + story_data = self.page_video_story(facebook_user, video) + if not story_data: + continue + for seen_data in merged_data: + if story_data['link'] == seen_data['link']: + if len(story_data['description']) > len(seen_data['description']): + seen_data['description'] = story_data['description'] + seen_data['title'] = story_data['title'] + break + + for story_data in merged_data: rss.add_item(**story_data) return rss.writeString('utf-8') - def extract_page(self): + def extract_page_name(self): page = None try: - username_groups = re.search('twitter.com/(\w+)/?', self.address) - if not username_groups: + page_groups = re.search('facebook.com/(\w+)/?', self.address) + if not page_groups: return - username = username_groups.group(1) + page = page_groups.group(1) except IndexError: return - return username + return page - def fetch_user(self, username): - twitter_api = None + def facebook_user(self): + facebook_api = None social_services = None + if self.options.get('requesting_user_id', None): social_services = MSocialServices.get_user(self.options.get('requesting_user_id')) - try: - twitter_api = social_services.twitter_api() - except tweepy.error.TweepError, e: - logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s: %s' % - (self.feed.log_title[:30], self.address, e)) + facebook_api = social_services.facebook_api() + if not facebook_api: + logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' % + (self.feed.log_title[:30], self.address, self.options)) return else: usersubs = UserSubscription.objects.filter(feed=self.feed) @@ -79,210 +97,91 @@ class FacebookFetcher: logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s: No subscriptions' % (self.feed.log_title[:30], self.address)) return + for sub in usersubs: social_services = MSocialServices.get_user(sub.user_id) - if not social_services.twitter_uid: continue - try: - twitter_api = social_services.twitter_api() - if not twitter_api: - continue - else: - break - except tweepy.error.TweepError, e: - logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s: %s' % - (self.feed.log_title[:30], self.address, e)) + if not social_services.facebook_uid: continue + + facebook_api = social_services.facebook_api() + if not facebook_api: + continue + else: + break - if not twitter_api: - logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s: No twitter API for %s' % - (self.feed.log_title[:30], self.address, usersubs[0].user.username)) - return + if not facebook_api: + logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' % + (self.feed.log_title[:30], self.address, usersubs[0].user.username)) + return - try: - twitter_user = twitter_api.get_user(username) - except TypeError, e: - logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: %s" % (e)) - return - except tweepy.error.TweepError, e: - message = str(e).lower() - if ((len(e.args) >= 2 and e.args[2] == 63) or - ('temporarily locked' in message)): - # Suspended - logging.debug(u' ***> [%-30s] ~FRTwitter failed, user suspended, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: User suspended") - return - elif 'suspended' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter user suspended, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: User suspended") - return - elif 'expired token' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter user expired, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: Expired token") - social_services.disconnect_twitter() - return - elif 'not found' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter user not found, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: User not found") - return - elif 'over capacity' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter over capacity, ignoring... %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(460, "Twitter Error: Over capacity") - return - else: - raise e - - return twitter_user + return facebook_api - def page_feed(self, facebook_user, page): - try: - stories = facebook_user.get_object(page, fields='feed') - except Exception, e: - message = str(e).lower() - if 'not authorized' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter timeline failed, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: Not authorized") - return [] - elif 'user not found' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter user not found, disconnecting twitter: %s: %s' % - (self.feed.log_title[:30], self.address, e)) - self.feed.save_feed_history(560, "Twitter Error: User not found") - return [] - elif 'blocked from viewing' in message: - logging.debug(u' ***> [%-30s] ~FRTwitter user blocked, ignoring: %s' % - (self.feed.log_title[:30], e)) - self.feed.save_feed_history(560, "Twitter Error: Blocked from viewing") - return [] - else: - raise e + def fetch_page_feed(self, facebook_user, page, fields): + stories = facebook_user.get_object(page, fields=fields) if not stories: return [] - return stories - - def page_feed_story(self, story): - categories = set() - - if user_tweet['full_text'].startswith('RT @'): - categories.add('retweet') - elif user_tweet['in_reply_to_status_id'] or user_tweet['full_text'].startswith('@'): - categories.add('reply') - else: - categories.add('tweet') - if user_tweet['full_text'].startswith('RT @'): - categories.add('retweet') - if user_tweet['favorite_count']: - categories.add('liked') - if user_tweet['retweet_count']: - categories.add('retweeted') - if 'http' in user_tweet['full_text']: - categories.add('link') - - story = {} - content_tweet = user_tweet - entities = "" - author = user_tweet.get('author') or user_tweet.get('user') - if not isinstance(author, dict): author = author.__dict__ - author_name = author['screen_name'] - original_author_name = author_name - if user_tweet['in_reply_to_user_id'] == author['id']: - categories.add('reply-to-self') - retweet_author = "" - if 'retweeted_status' in user_tweet: - retweet_author = """Retweeted by - - %s - on %s""" % ( - author_name, - author['profile_image_url_https'], - author_name, - author_name, - DateFormat(user_tweet['created_at']).format('l, F jS, Y g:ia').replace('.',''), - ) - content_tweet = user_tweet['retweeted_status'].__dict__ - author = content_tweet['author'] - if not isinstance(author, dict): author = author.__dict__ - author_name = author['screen_name'] - - tweet_title = user_tweet['full_text'] - tweet_text = linebreaks(content_tweet['full_text']) - replaced = {} - entities_media = content_tweet['entities'].get('media', []) - if 'extended_entities' in content_tweet: - entities_media = content_tweet['extended_entities'].get('media', []) - for media in entities_media: - if 'media_url_https' not in media: continue - if media['type'] == 'photo': - if media.get('url') and media['url'] in tweet_text: - tweet_title = tweet_title.replace(media['url'], media['display_url']) - replacement = "%s" % (media['expanded_url'], media['display_url']) - if not replaced.get(media['url']): - tweet_text = tweet_text.replace(media['url'], replacement) - replaced[media['url']] = True - entities += "
" % media['media_url_https'] - if 'photo' not in categories: - categories.add('photo') - for url in content_tweet['entities'].get('urls', []): - if url['url'] in tweet_text: - replacement = "%s" % (url['expanded_url'], url['display_url']) - if not replaced.get(url['url']): - tweet_text = tweet_text.replace(url['url'], replacement) - replaced[url['url']] = True - tweet_title = tweet_title.replace(url['url'], url['display_url']) - - quote_tweet_content = "" - if 'quoted_status' in content_tweet: - quote_tweet_content = "
"+self.tweet_story(content_tweet['quoted_status'])['description']+"
" - - - created_date = content_tweet['created_at'] + return stories + + def page_posts_story(self, facebook_user, page_story): + categories = set() + if 'message' not in page_story: + # Probably a story shared on the page's timeline, not a published story + return + message = linebreaks(page_story['message']) + created_date = page_story['created_time'] if isinstance(created_date, unicode): created_date = dateutil.parser.parse(created_date) - - content = """
-
%s
-
%s
-
-
%s
-
- Posted by - - %s - on %s
-
%s
-
%s %s%s %s
+ permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url'] + content = """
+
%s
""" % ( - tweet_text, - quote_tweet_content, - entities, - author_name, - author['profile_image_url_https'], - author_name, - author_name, - DateFormat(created_date).format('l, F jS, Y g:ia').replace('.',''), - retweet_author, - ("

" if content_tweet['favorite_count'] or content_tweet['retweet_count'] else ""), - ("%s %s" % (content_tweet['favorite_count'], "like" if content_tweet['favorite_count'] == 1 else "likes")) if content_tweet['favorite_count'] else "", - (", " if content_tweet['favorite_count'] and content_tweet['retweet_count'] else ""), - ("%s %s" % (content_tweet['retweet_count'], "retweet" if content_tweet['retweet_count'] == 1 else "retweets")) if content_tweet['retweet_count'] else "", + message ) story = { - 'title': tweet_title, - 'link': "https://twitter.com/%s/status/%s" % (original_author_name, user_tweet['id']), + 'title': message, + 'link': permalink, 'description': content, - 'author_name': author_name, 'categories': list(categories), - 'unique_id': "tweet:%s" % user_tweet['id'], - 'pubdate': user_tweet['created_at'], + 'unique_id': "fb_post:%s" % page_story['id'], + 'pubdate': created_date, } return story + + def page_video_story(self, facebook_user, page_story): + categories = set() + if 'description' not in page_story: + return + message = linebreaks(page_story['description']) + created_date = page_story['updated_time'] + if isinstance(created_date, unicode): + created_date = dateutil.parser.parse(created_date) + permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url'] + embed_html = facebook_user.get_object(page_story['id'], fields='embed_html') + + if permalink.startswith('/'): + permalink = "https://www.facebook.com%s" % permalink + + content = """
+
%s
+
%s
+
""" % ( + message, + embed_html.get('embed_html', '') + ) + + story = { + 'title': page_story.get('story', message), + 'link': permalink, + 'description': content, + 'categories': list(categories), + 'unique_id': "fb_post:%s" % page_story['id'], + 'pubdate': created_date, + } + + return story + + \ No newline at end of file diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index a1def5d05..aea6bbbd3 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -34,6 +34,7 @@ from django.utils.encoding import smart_unicode from utils import json_functions as json from celery.exceptions import SoftTimeLimitExceeded from utils.twitter_fetcher import TwitterFetcher +from utils.facebook_fetcher import FacebookFetcher from utils.json_fetcher import JSONFetcher # from utils.feed_functions import mail_feed_error_to_admin @@ -106,6 +107,13 @@ class FetchFeed: (self.feed.log_title[:30], address)) return FEED_ERRHTTP, None self.fpf = feedparser.parse(twitter_feed) + elif re.match(r'(https?)?://(www\.)?facebook.com/\w+/?$', qurl(address, remove=['_'])): + facebook_feed = self.fetch_facebook(address) + if not facebook_feed: + logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' % + (self.feed.log_title[:30], address)) + return FEED_ERRHTTP, None + self.fpf = feedparser.parse(facebook_feed) if not self.fpf: try: @@ -186,6 +194,10 @@ class FetchFeed: twitter_fetcher = TwitterFetcher(self.feed, self.options) return twitter_fetcher.fetch(address) + def fetch_facebook(self, address=None): + facebook_fetcher = FacebookFetcher(self.feed, self.options) + return facebook_fetcher.fetch(address) + def fetch_json_feed(self, address, headers): json_fetcher = JSONFetcher(self.feed, self.options) return json_fetcher.fetch(address, headers) diff --git a/utils/feedfinder2.py b/utils/feedfinder2.py index cf897e4a8..844fe6136 100755 --- a/utils/feedfinder2.py +++ b/utils/feedfinder2.py @@ -85,7 +85,10 @@ def find_feeds(url, check_all=False, user_agent=None): # Look for tags. logging.info("Looking for tags.") - tree = BeautifulSoup(feed_text) + try: + tree = BeautifulSoup(feed_text) + except ValueError: + return [] links = [] for link in tree.findAll("link"): if link.get("type") in ["application/rss+xml",