Updating youtube fetcher to use channels/playlists/users for everything, no longer relying on RSS/xml url.

This commit is contained in:
Samuel Clay 2024-03-03 12:59:12 -05:00
parent ec44039cc9
commit a5222d9675
4 changed files with 242 additions and 198 deletions

34
.vscode/settings.json vendored
View file

@ -1,23 +1,16 @@
{ {
"black-formatter.args": [
"--line-length 110"
],
"isort.args": [ "isort.args": [
"--profile", "--profile",
"black" "black"
], ],
"editor.formatOnSave": true, // "python.linting.enabled": true,
"editor.codeActionsOnSave": { // "python.linting.pylintEnabled": false,
"source.organizeImports": "explicit" // "python.linting.flake8Enabled": true,
}, // "python.linting.pylamaEnabled": false,
"python.linting.enabled": true, // "python.linting.flake8Args": [
"python.linting.pylintEnabled": false, // "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
"python.linting.flake8Enabled": true, // ],
"python.linting.pylamaEnabled": false, // "python.pythonPath": "~/.virtualenvs/newsblur/bin/python",
"python.linting.flake8Args": [
"--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401"
],
"python.pythonPath": "~/.virtualenvs/newsblur3/bin/python",
"editor.bracketPairColorization.enabled": true, "editor.bracketPairColorization.enabled": true,
"editor.guides.bracketPairs": "active", "editor.guides.bracketPairs": "active",
"git.ignoreLimitWarning": true, "git.ignoreLimitWarning": true,
@ -38,15 +31,12 @@
"docker/volumes": true, "docker/volumes": true,
"requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history "requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history
}, },
"python.formatting.blackArgs": [ // "python.formatting.blackArgs": [
"--line-length=110", // "--line-length=110",
"--skip-string-normalization" // "--skip-string-normalization"
], // ],
"files.associations": { "files.associations": {
"*.yml": "ansible" "*.yml": "ansible"
}, },
"nrf-connect.toolchain.path": "${nrf-connect.toolchain:1.9.1}",
"C_Cpp.default.configurationProvider": "nrf-connect",
"editor.formatOnSave": false,
"ansible.python.interpreterPath": "/opt/homebrew/bin/python3", "ansible.python.interpreterPath": "/opt/homebrew/bin/python3",
} }

View file

@ -8,11 +8,11 @@ billiard==3.6.4.0
bleach==3.2.1 bleach==3.2.1
boto3==1.18.12 boto3==1.18.12
botocore==1.21.12 botocore==1.21.12
black~=23.1.0
celery==4.4.7 celery==4.4.7
certifi==2020.12.5 certifi==2020.12.5
cffi==1.14.5 cffi==1.14.5
chardet==3.0.4 chardet==3.0.4
click==7.1.2
ConfigArgParse==1.4 ConfigArgParse==1.4
cryptography==3.4.7 cryptography==3.4.7
cssutils==1.0.2 cssutils==1.0.2
@ -40,13 +40,10 @@ factory-boy==3.2.0
Faker==8.8.2 Faker==8.8.2
feedparser>=6,<7 feedparser>=6,<7
filelock==3.0.12 filelock==3.0.12
Flask==1.1.2 Flask==3.0.2
Flask-BasicAuth==0.2.0 Flask-BasicAuth==0.2.0
future==0.18.2 future==0.18.2
gevent==21.1.2 gunicorn==21.2.0
geventhttpclient==1.4.4
greenlet==1.1.0
gunicorn==20.1.0
h2==2.6.2 h2==2.6.2
hiredis==1.1.0 hiredis==1.1.0
hpack==3.0.0 hpack==3.0.0
@ -57,24 +54,21 @@ idna==2.10
image==1.5.33 image==1.5.33
iniconfig==1.1.1 iniconfig==1.1.1
isodate==0.6.0 isodate==0.6.0
itsdangerous==1.1.0 Jinja2==3.1.3
Jinja2==2.11.3
jmespath==0.10.0 jmespath==0.10.0
jsonpickle==2.0.0 jsonpickle==2.0.0
kombu==4.6.11 kombu==4.6.11
locust==1.4.3 locust==1.4.3
lxml==4.6.2 lxml==5.1.0
MarkupSafe==1.1.1
mock==4.0.2 mock==4.0.2
mongoengine==0.21.0 mongoengine==0.21.0
msgpack==1.0.2 msgpack==1.0.2
ndg-httpsclient==0.5.1 ndg-httpsclient==0.5.1
nose==1.3.7 nose==1.3.7
nose-exclude==0.5.0 nose-exclude==0.5.0
numpy==1.19.4 numpy==1.26.4
oauth2==1.9.0.post1 oauth2==1.9.0.post1
oauthlib==3.1.0 oauthlib==3.1.0
packaging==20.9
paypalrestsdk==1.13.1 paypalrestsdk==1.13.1
pbr==5.6.0 pbr==5.6.0
Pillow==8.0.1 Pillow==8.0.1
@ -104,7 +98,7 @@ raven==6.10.0
redis>=4,<5 redis>=4,<5
requests==2.25.0 requests==2.25.0
requests-oauthlib==1.3.0 requests-oauthlib==1.3.0
scipy==1.5.4 scipy==1.12.0
sentry-sdk>=1,<2 sentry-sdk>=1,<2
sgmllib3k==1.0.0 sgmllib3k==1.0.0
simplejson==3.17.2 simplejson==3.17.2
@ -125,7 +119,6 @@ virtualenv==20.4.6
virtualenv-clone==0.5.4 virtualenv-clone==0.5.4
virtualenvwrapper==4.8.4 virtualenvwrapper==4.8.4
webencodings==0.5.1 webencodings==0.5.1
Werkzeug==1.0.1
XlsxWriter==1.3.7 XlsxWriter==1.3.7
zope.event==4.5.0 zope.event==4.5.0
zope.interface==5.4.0 zope.interface==5.4.0

View file

@ -19,9 +19,7 @@ import random
import re import re
import xml.sax import xml.sax
import dateutil.parser
import feedparser import feedparser
import isodate
import pymongo import pymongo
import redis import redis
import requests import requests
@ -58,6 +56,7 @@ from utils.feed_functions import TimeoutError, timelimit
from utils.json_fetcher import JSONFetcher from utils.json_fetcher import JSONFetcher
from utils.story_functions import linkify, pre_process_story, strip_tags from utils.story_functions import linkify, pre_process_story, strip_tags
from utils.twitter_fetcher import TwitterFetcher from utils.twitter_fetcher import TwitterFetcher
from utils.youtube_fetcher import YoutubeFetcher
# from utils.feed_functions import mail_feed_error_to_admin # from utils.feed_functions import mail_feed_error_to_admin
@ -131,10 +130,7 @@ class FetchFeed:
return FEED_OK, self.fpf return FEED_OK, self.fpf
if 'youtube.com' in address: if 'youtube.com' in address:
try: youtube_feed = self.fetch_youtube()
youtube_feed = self.fetch_youtube(address)
except (requests.adapters.ConnectionError):
youtube_feed = None
if not youtube_feed: if not youtube_feed:
logging.debug( logging.debug(
' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address) ' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address)
@ -313,162 +309,9 @@ class FetchFeed:
json_fetcher = JSONFetcher(self.feed, self.options) json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers) return json_fetcher.fetch(address, headers)
def fetch_youtube(self, address): def fetch_youtube(self):
username = None youtube_fetcher = YoutubeFetcher(self.feed, self.options)
channel_id = None return youtube_fetcher.fetch()
list_id = None
if 'gdata.youtube.com' in address:
try:
username_groups = re.search(r'gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
if not username_groups:
return
username = username_groups.group(1)
except IndexError:
return
elif 'youtube.com/@' in address:
try:
username = address.split('youtube.com/@')[1]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?user=' in address:
try:
username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
try:
channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0]
except (IndexError, KeyError):
return
elif 'youtube.com/playlist' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
try:
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0]
except IndexError:
return
if channel_id:
video_ids_xml = requests.get(
"https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id
)
channel_json = requests.get(
"https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s"
% (channel_id, settings.YOUTUBE_API_KEY)
)
channel = json.decode(channel_json.content)
try:
username = channel['items'][0]['snippet']['title']
description = channel['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
elif list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
username = playlist['items'][0]['snippet']['title']
description = playlist['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
elif username:
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username)
description = "YouTube videos uploaded by %s" % username
else:
return
if list_id:
playlist_json = requests.get(
"https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
% (list_id, settings.YOUTUBE_API_KEY)
)
playlist = json.decode(playlist_json.content)
try:
video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']]
except (IndexError, KeyError):
return
else:
if video_ids_xml.status_code != 200:
return
video_ids_soup = BeautifulSoup(video_ids_xml.content, features="lxml")
channel_url = video_ids_soup.find('author').find('uri').getText()
video_ids = []
for video_id in video_ids_soup.findAll('yt:videoid'):
video_ids.append(video_id.getText())
videos_json = requests.get(
"https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
% (','.join(video_ids), settings.YOUTUBE_API_KEY)
)
videos = json.decode(videos_json.content)
if 'error' in videos:
logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
return
data = {}
data['title'] = "%s's YouTube Videos" % username if 'Uploads' not in username else username
data['link'] = channel_url
data['description'] = description
data['lastBuildDate'] = datetime.datetime.utcnow()
data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
data['docs'] = None
data['feed_url'] = address
rss = feedgenerator.Atom1Feed(**data)
for video in videos['items']:
thumbnail = video['snippet']['thumbnails'].get('maxres')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('high')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('medium')
duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
duration_min, seconds = divmod(duration_sec, 60)
hours, minutes = divmod(duration_min, 60)
if hours >= 1:
duration = "%s:%s:%s" % (
hours,
'{0:02d}'.format(minutes),
'{0:02d}'.format(seconds),
)
else:
duration = "%s:%s" % (minutes, '{0:02d}'.format(seconds))
content = """<div class="NB-youtube-player">
<iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
</div>
<div class="NB-youtube-stats"><small>
<b>From:</b> <a href="%s">%s</a><br />
<b>Duration:</b> %s<br />
</small></div><hr>
<div class="NB-youtube-description">%s</div>
<img src="%s" style="display:none" />""" % (
("https://www.youtube.com/embed/" + video['id']),
channel_url,
username,
duration,
linkify(linebreaks(video['snippet']['description'])),
thumbnail['url'] if thumbnail else "",
)
link = "http://www.youtube.com/watch?v=%s" % video['id']
story_data = {
'title': video['snippet']['title'],
'link': link,
'description': content,
'author_name': username,
'categories': [],
'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
}
rss.add_item(**story_data)
return rss.writeString('utf-8')
class ProcessFeed: class ProcessFeed:

218
utils/youtube_fetcher.py Normal file
View file

@ -0,0 +1,218 @@
import datetime
from utils import json_functions as json
import isodate
import re
import urllib.error
import urllib.parse
import urllib.request
import dateutil.parser
import requests
from django.conf import settings
from django.utils import feedgenerator
from django.utils.html import linebreaks
from apps.reader.models import UserSubscription
from apps.social.models import MSocialServices
from utils.story_functions import linkify
from utils import log as logging
class YoutubeFetcher:
    """Fetch videos for a YouTube channel, playlist, or user via the
    YouTube Data API v3 and render them as an Atom feed string.

    Replaces the old RSS/videos.xml scraping: the feed address is parsed for a
    username/handle, channel id, or playlist id, the matching uploads are
    listed through the API, and the result is re-serialized with
    Django's feedgenerator.
    """

    def __init__(self, feed, options=None):
        # feed: the NewsBlur Feed model whose feed_address points at YouTube.
        self.feed = feed
        self.options = options or {}
        self.address = self.feed.feed_address

    def fetch(self):
        """Build and return the Atom feed as a UTF-8 string.

        Returns None when the address doesn't parse as a YouTube feed, an API
        lookup fails, or no video ids could be collected.
        """
        username = self.extract_username(self.address)
        channel_id = self.extract_channel_id(self.address)
        list_id = self.extract_list_id(self.address)
        # Initialize everything up front so a partial match can never leave a
        # name unbound below.
        video_ids = None
        title = None
        description = None
        channel_url = None

        if channel_id:
            video_ids, title, description = self.fetch_channel_videos(channel_id)
            channel_url = "https://www.youtube.com/channel/%s" % channel_id
        elif list_id:
            video_ids, title, description = self.fetch_playlist_videos(list_id)
            channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
        elif username:
            video_ids, title, description = self.fetch_user_videos(username)
            channel_url = "https://www.youtube.com/user/%s" % username

        if not video_ids:
            return

        videos = self.fetch_videos(video_ids)
        if not videos:
            # API error (already logged in fetch_videos); bail out instead of
            # crashing on videos["items"] below.
            return

        data = {}
        if username:
            data["title"] = f"{username}'s YouTube Videos"
        else:
            data["title"] = title
        data["link"] = channel_url
        data["description"] = description
        data["lastBuildDate"] = datetime.datetime.utcnow()
        data["generator"] = "NewsBlur YouTube API v3 Decrapifier - %s" % settings.NEWSBLUR_URL
        data["docs"] = None
        data["feed_url"] = self.address
        rss = feedgenerator.Atom1Feed(**data)

        for video in videos["items"]:
            # Prefer the largest thumbnail the API offers.
            thumbnail = video["snippet"]["thumbnails"].get("maxres")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("high")
            if not thumbnail:
                thumbnail = video["snippet"]["thumbnails"].get("medium")
            # ISO-8601 duration (e.g. "PT1H2M3S") -> "h:mm:ss" / "m:ss" display.
            duration_sec = isodate.parse_duration(video["contentDetails"]["duration"]).seconds
            duration_min, seconds = divmod(duration_sec, 60)
            hours, minutes = divmod(duration_min, 60)
            if hours >= 1:
                duration = "%s:%s:%s" % (
                    hours,
                    "{0:02d}".format(minutes),
                    "{0:02d}".format(seconds),
                )
            else:
                duration = "%s:%s" % (minutes, "{0:02d}".format(seconds))
            content = """<div class="NB-youtube-player">
                <iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe>
            </div>
            <div class="NB-youtube-stats"><small>
                <b>From:</b> <a href="%s">%s</a><br />
                <b>Duration:</b> %s<br />
            </small></div><hr>
            <div class="NB-youtube-description">%s</div>
            <img src="%s" style="display:none" />""" % (
                ("https://www.youtube.com/embed/" + video["id"]),
                channel_url,
                username,
                duration,
                linkify(linebreaks(video["snippet"]["description"])),
                thumbnail["url"] if thumbnail else "",
            )
            link = "http://www.youtube.com/watch?v=%s" % video["id"]
            story_data = {
                "title": video["snippet"]["title"],
                "link": link,
                "description": content,
                "author_name": username,
                "categories": [],
                "unique_id": "tag:youtube.com,2008:video:%s" % video["id"],
                "pubdate": dateutil.parser.parse(video["snippet"]["publishedAt"]),
            }
            rss.add_item(**story_data)
        return rss.writeString("utf-8")

    def extract_username(self, url):
        """Return the username or handle embedded in a YouTube url, or None.

        Handles legacy gdata urls, "@handle" urls, videos.xml?user= feeds, and
        /user/ channel pages.
        """
        if "gdata.youtube.com" in url:
            username_groups = re.search(r"gdata.youtube.com/feeds/\w+/users/(\w+)/", url)
            if not username_groups:
                return
            return username_groups.group(1)
        elif "youtube.com/@" in url:
            try:
                return url.split("youtube.com/@")[1]
            except IndexError:
                return
        elif "youtube.com/feeds/videos.xml?user=" in url:
            try:
                # KeyError added alongside IndexError for consistency with
                # extract_channel_id: parse_qs omits missing/empty params.
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["user"][0]
            except (IndexError, KeyError):
                return
        elif "youtube.com/user/" in url:
            username = re.findall(r"youtube.com/user/([^/]+)", url)
            if username:
                return username[0]

    def extract_channel_id(self, url):
        """Return the channel_id query param from a videos.xml url, or None."""
        if "youtube.com/feeds/videos.xml?channel_id=" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["channel_id"][0]
            except (IndexError, KeyError):
                return

    def extract_list_id(self, url):
        """Return the playlist id from a playlist page or videos.xml url, or None."""
        if "youtube.com/playlist" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["list"][0]
            except (IndexError, KeyError):
                return
        elif "youtube.com/feeds/videos.xml?playlist_id" in url:
            try:
                return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["playlist_id"][0]
            except (IndexError, KeyError):
                return

    def fetch_videos(self, video_ids):
        """Fetch snippet+contentDetails for the given video ids.

        Returns the decoded API payload, or None when the API reports an
        error (logged). "%%2C" is a literal url-encoded comma surviving the
        %-format below.
        """
        videos_json = requests.get(
            "https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s"
            % (",".join(video_ids), settings.YOUTUBE_API_KEY)
        )
        videos = json.decode(videos_json.content)
        if "error" in videos:
            logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
            return
        return videos

    def fetch_channel_videos(self, channel_id):
        """Resolve a channel to its uploads playlist and fetch its videos.

        Returns (video_ids, title, description); (None, None, None) on failure.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&id=%s&key=%s"
            % (channel_id, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            # Bug fix: callers unpack three values, so a bare return would
            # raise "cannot unpack non-sequence NoneType".
            return None, None, None
        return self.fetch_playlist_videos(uploads_list_id, title, description)

    def fetch_playlist_videos(self, list_id, title=None, description=None):
        """Fetch the video ids of a playlist.

        Looks up the playlist's own title/description unless both were passed
        in (the channel/user paths supply them). Returns
        (video_ids, title, description); (None, None, None) on failure.
        """
        if not title and not description:
            playlist_json = requests.get(
                "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s"
                % (list_id, settings.YOUTUBE_API_KEY)
            )
            playlist = json.decode(playlist_json.content)
            try:
                title = playlist["items"][0]["snippet"]["title"]
                description = playlist["items"][0]["snippet"]["description"]
            except (IndexError, KeyError):
                return None, None, None
        playlist_json = requests.get(
            "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s"
            % (list_id, settings.YOUTUBE_API_KEY)
        )
        playlist = json.decode(playlist_json.content)
        try:
            video_ids = [video["snippet"]["resourceId"]["videoId"] for video in playlist["items"]]
        except (IndexError, KeyError):
            return None, None, None
        return video_ids, title, description

    def fetch_user_videos(self, username, username_key="forUsername"):
        """Resolve a legacy username (or handle) to its uploads playlist.

        Tries the channels endpoint with forUsername first, falling back to
        forHandle. Returns (video_ids, title, description);
        (None, None, None) on failure.
        """
        channel_json = requests.get(
            "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&%s=%s&key=%s"
            % (username_key, username, settings.YOUTUBE_API_KEY)
        )
        channel = json.decode(channel_json.content)
        try:
            title = channel["items"][0]["snippet"]["title"]
            description = channel["items"][0]["snippet"]["description"]
            uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        except (IndexError, KeyError):
            if username_key == "forUsername":
                # Bug fix: a username that only exists as a handle returns no
                # items here, so the fallback must fire from the except path
                # too (previously it was unreachable in this case).
                return self.fetch_user_videos(username, username_key="forHandle")
            # Bug fix: callers unpack three values; see fetch_channel_videos.
            return None, None, None
        if not uploads_list_id and username_key == "forUsername":
            return self.fetch_user_videos(username, username_key="forHandle")
        return self.fetch_playlist_videos(uploads_list_id, title, description)