From a5222d967546d49c259e38af105ad2272038dd5c Mon Sep 17 00:00:00 2001 From: Samuel Clay Date: Sun, 3 Mar 2024 12:59:12 -0500 Subject: [PATCH] Updating youtube fetcher to use channels/playlists/users for everything, no longer relying on RSS/xml url. --- .vscode/settings.json | 34 +++--- config/requirements.txt | 21 ++-- utils/feed_fetcher.py | 167 +----------------------------- utils/youtube_fetcher.py | 218 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 198 deletions(-) create mode 100644 utils/youtube_fetcher.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 65f56c0d5..6d324acf3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,23 +1,16 @@ { - "black-formatter.args": [ - "--line-length 110" - ], "isort.args": [ "--profile", "black" ], - "editor.formatOnSave": true, - "editor.codeActionsOnSave": { - "source.organizeImports": "explicit" - }, - "python.linting.enabled": true, - "python.linting.pylintEnabled": false, - "python.linting.flake8Enabled": true, - "python.linting.pylamaEnabled": false, - "python.linting.flake8Args": [ - "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401" - ], - "python.pythonPath": "~/.virtualenvs/newsblur3/bin/python", + // "python.linting.enabled": true, + // "python.linting.pylintEnabled": false, + // "python.linting.flake8Enabled": true, + // "python.linting.pylamaEnabled": false, + // "python.linting.flake8Args": [ + // "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401" + // ], + // "python.pythonPath": "~/.virtualenvs/newsblur/bin/python", "editor.bracketPairColorization.enabled": true, "editor.guides.bracketPairs": "active", "git.ignoreLimitWarning": true, @@ -38,15 +31,12 @@ "docker/volumes": true, "requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history }, - "python.formatting.blackArgs": [ - "--line-length=110", - "--skip-string-normalization" - ], + // "python.formatting.blackArgs": [ + // 
"--line-length=110", + // "--skip-string-normalization" + // ], "files.associations": { "*.yml": "ansible" }, - "nrf-connect.toolchain.path": "${nrf-connect.toolchain:1.9.1}", - "C_Cpp.default.configurationProvider": "nrf-connect", - "editor.formatOnSave": false, "ansible.python.interpreterPath": "/opt/homebrew/bin/python3", } diff --git a/config/requirements.txt b/config/requirements.txt index 769a0f810..d2a3e1fe7 100644 --- a/config/requirements.txt +++ b/config/requirements.txt @@ -8,11 +8,11 @@ billiard==3.6.4.0 bleach==3.2.1 boto3==1.18.12 botocore==1.21.12 +black~=23.1.0 celery==4.4.7 certifi==2020.12.5 cffi==1.14.5 chardet==3.0.4 -click==7.1.2 ConfigArgParse==1.4 cryptography==3.4.7 cssutils==1.0.2 @@ -40,13 +40,10 @@ factory-boy==3.2.0 Faker==8.8.2 feedparser>=6,<7 filelock==3.0.12 -Flask==1.1.2 +Flask==3.0.2 Flask-BasicAuth==0.2.0 future==0.18.2 -gevent==21.1.2 -geventhttpclient==1.4.4 -greenlet==1.1.0 -gunicorn==20.1.0 +gunicorn==21.2.0 h2==2.6.2 hiredis==1.1.0 hpack==3.0.0 @@ -57,24 +54,21 @@ idna==2.10 image==1.5.33 iniconfig==1.1.1 isodate==0.6.0 -itsdangerous==1.1.0 -Jinja2==2.11.3 +Jinja2==3.1.3 jmespath==0.10.0 jsonpickle==2.0.0 kombu==4.6.11 locust==1.4.3 -lxml==4.6.2 -MarkupSafe==1.1.1 +lxml==5.1.0 mock==4.0.2 mongoengine==0.21.0 msgpack==1.0.2 ndg-httpsclient==0.5.1 nose==1.3.7 nose-exclude==0.5.0 -numpy==1.19.4 +numpy==1.26.4 oauth2==1.9.0.post1 oauthlib==3.1.0 -packaging==20.9 paypalrestsdk==1.13.1 pbr==5.6.0 Pillow==8.0.1 @@ -104,7 +98,7 @@ raven==6.10.0 redis>=4,<5 requests==2.25.0 requests-oauthlib==1.3.0 -scipy==1.5.4 +scipy==1.12.0 sentry-sdk>=1,<2 sgmllib3k==1.0.0 simplejson==3.17.2 @@ -125,7 +119,6 @@ virtualenv==20.4.6 virtualenv-clone==0.5.4 virtualenvwrapper==4.8.4 webencodings==0.5.1 -Werkzeug==1.0.1 XlsxWriter==1.3.7 zope.event==4.5.0 zope.interface==5.4.0 diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index b4cca52e9..c6b804418 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -19,9 +19,7 @@ import 
random import re import xml.sax -import dateutil.parser import feedparser -import isodate import pymongo import redis import requests @@ -58,6 +56,7 @@ from utils.feed_functions import TimeoutError, timelimit from utils.json_fetcher import JSONFetcher from utils.story_functions import linkify, pre_process_story, strip_tags from utils.twitter_fetcher import TwitterFetcher +from utils.youtube_fetcher import YoutubeFetcher # from utils.feed_functions import mail_feed_error_to_admin @@ -131,10 +130,7 @@ class FetchFeed: return FEED_OK, self.fpf if 'youtube.com' in address: - try: - youtube_feed = self.fetch_youtube(address) - except (requests.adapters.ConnectionError): - youtube_feed = None + youtube_feed = self.fetch_youtube() if not youtube_feed: logging.debug( ' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address) @@ -313,162 +309,9 @@ class FetchFeed: json_fetcher = JSONFetcher(self.feed, self.options) return json_fetcher.fetch(address, headers) - def fetch_youtube(self, address): - username = None - channel_id = None - list_id = None - - if 'gdata.youtube.com' in address: - try: - username_groups = re.search(r'gdata.youtube.com/feeds/\w+/users/(\w+)/', address) - if not username_groups: - return - username = username_groups.group(1) - except IndexError: - return - elif 'youtube.com/@' in address: - try: - username = address.split('youtube.com/@')[1] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?user=' in address: - try: - username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?channel_id=' in address: - try: - channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0] - except (IndexError, KeyError): - return - elif 'youtube.com/playlist' in address: - try: - list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0] - except IndexError: - return - elif 
'youtube.com/feeds/videos.xml?playlist_id' in address: - try: - list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0] - except IndexError: - return - - if channel_id: - video_ids_xml = requests.get( - "https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id - ) - channel_json = requests.get( - "https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" - % (channel_id, settings.YOUTUBE_API_KEY) - ) - channel = json.decode(channel_json.content) - try: - username = channel['items'][0]['snippet']['title'] - description = channel['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - elif list_id: - playlist_json = requests.get( - "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" - % (list_id, settings.YOUTUBE_API_KEY) - ) - playlist = json.decode(playlist_json.content) - try: - username = playlist['items'][0]['snippet']['title'] - description = playlist['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - channel_url = "https://www.youtube.com/playlist?list=%s" % list_id - elif username: - video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username) - description = "YouTube videos uploaded by %s" % username - else: - return - - if list_id: - playlist_json = requests.get( - "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" - % (list_id, settings.YOUTUBE_API_KEY) - ) - playlist = json.decode(playlist_json.content) - try: - video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']] - except (IndexError, KeyError): - return - else: - if video_ids_xml.status_code != 200: - return - video_ids_soup = BeautifulSoup(video_ids_xml.content, features="lxml") - channel_url = video_ids_soup.find('author').find('uri').getText() - video_ids = [] - for video_id in video_ids_soup.findAll('yt:videoid'): - video_ids.append(video_id.getText()) - - 
videos_json = requests.get( - "https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" - % (','.join(video_ids), settings.YOUTUBE_API_KEY) - ) - videos = json.decode(videos_json.content) - if 'error' in videos: - logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) - return - - data = {} - data['title'] = "%s's YouTube Videos" % username if 'Uploads' not in username else username - data['link'] = channel_url - data['description'] = description - data['lastBuildDate'] = datetime.datetime.utcnow() - data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL - data['docs'] = None - data['feed_url'] = address - rss = feedgenerator.Atom1Feed(**data) - - for video in videos['items']: - thumbnail = video['snippet']['thumbnails'].get('maxres') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('high') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('medium') - duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds - duration_min, seconds = divmod(duration_sec, 60) - hours, minutes = divmod(duration_min, 60) - if hours >= 1: - duration = "%s:%s:%s" % ( - hours, - '{0:02d}'.format(minutes), - '{0:02d}'.format(seconds), - ) - else: - duration = "%s:%s" % (minutes, '{0:02d}'.format(seconds)) - content = """
-                <div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s"></iframe></div>
-                <div class="NB-youtube-stats"><small>
-                    <b>From:</b> <a href="%s">%s</a><br />
-                    <b>Duration:</b> %s<br />
-                </small></div><hr>
-                <div class="NB-youtube-description">%s</div>
-                <img src="%s" style="display:none" />
- """ % ( - ("https://www.youtube.com/embed/" + video['id']), - channel_url, - username, - duration, - linkify(linebreaks(video['snippet']['description'])), - thumbnail['url'] if thumbnail else "", - ) - - link = "http://www.youtube.com/watch?v=%s" % video['id'] - story_data = { - 'title': video['snippet']['title'], - 'link': link, - 'description': content, - 'author_name': username, - 'categories': [], - 'unique_id': "tag:youtube.com,2008:video:%s" % video['id'], - 'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']), - } - rss.add_item(**story_data) - - return rss.writeString('utf-8') + def fetch_youtube(self): + youtube_fetcher = YoutubeFetcher(self.feed, self.options) + return youtube_fetcher.fetch() class ProcessFeed: diff --git a/utils/youtube_fetcher.py b/utils/youtube_fetcher.py new file mode 100644 index 000000000..6dac327ba --- /dev/null +++ b/utils/youtube_fetcher.py @@ -0,0 +1,218 @@ +import datetime +from utils import json_functions as json +import isodate +import re +import urllib.error +import urllib.parse +import urllib.request + +import dateutil.parser +import requests +from django.conf import settings +from django.utils import feedgenerator +from django.utils.html import linebreaks + +from apps.reader.models import UserSubscription +from apps.social.models import MSocialServices +from utils.story_functions import linkify +from utils import log as logging + + +class YoutubeFetcher: + def __init__(self, feed, options=None): + self.feed = feed + self.options = options or {} + self.address = self.feed.feed_address + + def fetch(self): + username = self.extract_username(self.address) + channel_id = self.extract_channel_id(self.address) + list_id = self.extract_list_id(self.address) + video_ids = None + + if channel_id: + video_ids, title, description = self.fetch_channel_videos(channel_id) + channel_url = "https://www.youtube.com/channel/%s" % channel_id + elif list_id: + video_ids, title, description = self.fetch_playlist_videos(list_id) 
+ channel_url = "https://www.youtube.com/playlist?list=%s" % list_id + elif username: + video_ids, title, description = self.fetch_user_videos(username) + channel_url = "https://www.youtube.com/user/%s" % username + + if not video_ids: + return + + videos = self.fetch_videos(video_ids) + data = {} + if username: + data["title"] = f"{username}'s YouTube Videos" + else: + data["title"] = title + data["link"] = channel_url + data["description"] = description + data["lastBuildDate"] = datetime.datetime.utcnow() + data["generator"] = "NewsBlur YouTube API v3 Decrapifier - %s" % settings.NEWSBLUR_URL + data["docs"] = None + data["feed_url"] = self.address + rss = feedgenerator.Atom1Feed(**data) + + for video in videos["items"]: + thumbnail = video["snippet"]["thumbnails"].get("maxres") + if not thumbnail: + thumbnail = video["snippet"]["thumbnails"].get("high") + if not thumbnail: + thumbnail = video["snippet"]["thumbnails"].get("medium") + duration_sec = isodate.parse_duration(video["contentDetails"]["duration"]).seconds + duration_min, seconds = divmod(duration_sec, 60) + hours, minutes = divmod(duration_min, 60) + if hours >= 1: + duration = "%s:%s:%s" % ( + hours, + "{0:02d}".format(minutes), + "{0:02d}".format(seconds), + ) + else: + duration = "%s:%s" % (minutes, "{0:02d}".format(seconds)) + content = """
+ +
+
+ From: %s
+ Duration: %s
+

+
%s
+ """ % ( + ("https://www.youtube.com/embed/" + video["id"]), + channel_url, + username, + duration, + linkify(linebreaks(video["snippet"]["description"])), + thumbnail["url"] if thumbnail else "", + ) + + link = "http://www.youtube.com/watch?v=%s" % video["id"] + story_data = { + "title": video["snippet"]["title"], + "link": link, + "description": content, + "author_name": username, + "categories": [], + "unique_id": "tag:youtube.com,2008:video:%s" % video["id"], + "pubdate": dateutil.parser.parse(video["snippet"]["publishedAt"]), + } + rss.add_item(**story_data) + + return rss.writeString("utf-8") + + def extract_username(self, url): + if "gdata.youtube.com" in url: + try: + username_groups = re.search(r"gdata.youtube.com/feeds/\w+/users/(\w+)/", url) + if not username_groups: + return + return username_groups.group(1) + except IndexError: + return + elif "youtube.com/@" in url: + try: + return url.split("youtube.com/@")[1] + except IndexError: + return + elif "youtube.com/feeds/videos.xml?user=" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["user"][0] + except IndexError: + return + elif "youtube.com/user/" in url: + username = re.findall(r"youtube.com/user/([^/]+)", url) + if username: + return username[0] + + def extract_channel_id(self, url): + if "youtube.com/feeds/videos.xml?channel_id=" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["channel_id"][0] + except (IndexError, KeyError): + return + + def extract_list_id(self, url): + if "youtube.com/playlist" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["list"][0] + except IndexError: + return + elif "youtube.com/feeds/videos.xml?playlist_id" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["playlist_id"][0] + except IndexError: + return + + def fetch_videos(self, video_ids): + videos_json = requests.get( + 
"https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" + % (",".join(video_ids), settings.YOUTUBE_API_KEY) + ) + videos = json.decode(videos_json.content) + if "error" in videos: + logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) + return + return videos + + def fetch_channel_videos(self, channel_id): + channel_json = requests.get( + "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&id=%s&key=%s" + % (channel_id, settings.YOUTUBE_API_KEY) + ) + channel = json.decode(channel_json.content) + try: + title = channel["items"][0]["snippet"]["title"] + description = channel["items"][0]["snippet"]["description"] + uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + except (IndexError, KeyError): + return + + return self.fetch_playlist_videos(uploads_list_id, title, description) + + def fetch_playlist_videos(self, list_id, title=None, description=None): + if not title and not description: + playlist_json = requests.get( + "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" + % (list_id, settings.YOUTUBE_API_KEY) + ) + playlist = json.decode(playlist_json.content) + try: + title = playlist["items"][0]["snippet"]["title"] + description = playlist["items"][0]["snippet"]["description"] + except (IndexError, KeyError): + return None, None, None + + playlist_json = requests.get( + "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" + % (list_id, settings.YOUTUBE_API_KEY) + ) + playlist = json.decode(playlist_json.content) + try: + video_ids = [video["snippet"]["resourceId"]["videoId"] for video in playlist["items"]] + except (IndexError, KeyError): + return None, None, None + + return video_ids, title, description + + def fetch_user_videos(self, username, username_key="forUsername"): + channel_json = requests.get( + 
"https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&%s=%s&key=%s" + % (username_key, username, settings.YOUTUBE_API_KEY) + ) + channel = json.decode(channel_json.content) + try: + title = channel["items"][0]["snippet"]["title"] + description = channel["items"][0]["snippet"]["description"] + uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + except (IndexError, KeyError): + return + + if not uploads_list_id and username_key == "forUsername": + return self.fetch_user_videos(username, username_key="forHandle") + + return self.fetch_playlist_videos(uploads_list_id, title, description)