From a5222d967546d49c259e38af105ad2272038dd5c Mon Sep 17 00:00:00 2001 From: Samuel Clay Date: Sun, 3 Mar 2024 12:59:12 -0500 Subject: [PATCH] Updating youtube fetcher to use channels/playlists/users for everything, no longer relying on RSS/xml url. --- .vscode/settings.json | 34 +++--- config/requirements.txt | 21 ++-- utils/feed_fetcher.py | 167 +----------------------------- utils/youtube_fetcher.py | 218 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 198 deletions(-) create mode 100644 utils/youtube_fetcher.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 65f56c0d5..6d324acf3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,23 +1,16 @@ { - "black-formatter.args": [ - "--line-length 110" - ], "isort.args": [ "--profile", "black" ], - "editor.formatOnSave": true, - "editor.codeActionsOnSave": { - "source.organizeImports": "explicit" - }, - "python.linting.enabled": true, - "python.linting.pylintEnabled": false, - "python.linting.flake8Enabled": true, - "python.linting.pylamaEnabled": false, - "python.linting.flake8Args": [ - "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401" - ], - "python.pythonPath": "~/.virtualenvs/newsblur3/bin/python", + // "python.linting.enabled": true, + // "python.linting.pylintEnabled": false, + // "python.linting.flake8Enabled": true, + // "python.linting.pylamaEnabled": false, + // "python.linting.flake8Args": [ + // "--ignore=E501,W293,W503,W504,E302,E722,E226,E221,E402,E401" + // ], + // "python.pythonPath": "~/.virtualenvs/newsblur/bin/python", "editor.bracketPairColorization.enabled": true, "editor.guides.bracketPairs": "active", "git.ignoreLimitWarning": true, @@ -38,15 +31,12 @@ "docker/volumes": true, "requirements.txt": true, // It's just a symlink to config/requirements.txt, which has git history }, - "python.formatting.blackArgs": [ - "--line-length=110", - "--skip-string-normalization" - ], + // "python.formatting.blackArgs": [ + // 
"--line-length=110", + // "--skip-string-normalization" + // ], "files.associations": { "*.yml": "ansible" }, - "nrf-connect.toolchain.path": "${nrf-connect.toolchain:1.9.1}", - "C_Cpp.default.configurationProvider": "nrf-connect", - "editor.formatOnSave": false, "ansible.python.interpreterPath": "/opt/homebrew/bin/python3", } diff --git a/config/requirements.txt b/config/requirements.txt index 769a0f810..d2a3e1fe7 100644 --- a/config/requirements.txt +++ b/config/requirements.txt @@ -8,11 +8,11 @@ billiard==3.6.4.0 bleach==3.2.1 boto3==1.18.12 botocore==1.21.12 +black~=23.1.0 celery==4.4.7 certifi==2020.12.5 cffi==1.14.5 chardet==3.0.4 -click==7.1.2 ConfigArgParse==1.4 cryptography==3.4.7 cssutils==1.0.2 @@ -40,13 +40,10 @@ factory-boy==3.2.0 Faker==8.8.2 feedparser>=6,<7 filelock==3.0.12 -Flask==1.1.2 +Flask==3.0.2 Flask-BasicAuth==0.2.0 future==0.18.2 -gevent==21.1.2 -geventhttpclient==1.4.4 -greenlet==1.1.0 -gunicorn==20.1.0 +gunicorn==21.2.0 h2==2.6.2 hiredis==1.1.0 hpack==3.0.0 @@ -57,24 +54,21 @@ idna==2.10 image==1.5.33 iniconfig==1.1.1 isodate==0.6.0 -itsdangerous==1.1.0 -Jinja2==2.11.3 +Jinja2==3.1.3 jmespath==0.10.0 jsonpickle==2.0.0 kombu==4.6.11 locust==1.4.3 -lxml==4.6.2 -MarkupSafe==1.1.1 +lxml==5.1.0 mock==4.0.2 mongoengine==0.21.0 msgpack==1.0.2 ndg-httpsclient==0.5.1 nose==1.3.7 nose-exclude==0.5.0 -numpy==1.19.4 +numpy==1.26.4 oauth2==1.9.0.post1 oauthlib==3.1.0 -packaging==20.9 paypalrestsdk==1.13.1 pbr==5.6.0 Pillow==8.0.1 @@ -104,7 +98,7 @@ raven==6.10.0 redis>=4,<5 requests==2.25.0 requests-oauthlib==1.3.0 -scipy==1.5.4 +scipy==1.12.0 sentry-sdk>=1,<2 sgmllib3k==1.0.0 simplejson==3.17.2 @@ -125,7 +119,6 @@ virtualenv==20.4.6 virtualenv-clone==0.5.4 virtualenvwrapper==4.8.4 webencodings==0.5.1 -Werkzeug==1.0.1 XlsxWriter==1.3.7 zope.event==4.5.0 zope.interface==5.4.0 diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index b4cca52e9..c6b804418 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -19,9 +19,7 @@ import 
random import re import xml.sax -import dateutil.parser import feedparser -import isodate import pymongo import redis import requests @@ -58,6 +56,7 @@ from utils.feed_functions import TimeoutError, timelimit from utils.json_fetcher import JSONFetcher from utils.story_functions import linkify, pre_process_story, strip_tags from utils.twitter_fetcher import TwitterFetcher +from utils.youtube_fetcher import YoutubeFetcher # from utils.feed_functions import mail_feed_error_to_admin @@ -131,10 +130,7 @@ class FetchFeed: return FEED_OK, self.fpf if 'youtube.com' in address: - try: - youtube_feed = self.fetch_youtube(address) - except (requests.adapters.ConnectionError): - youtube_feed = None + youtube_feed = self.fetch_youtube() if not youtube_feed: logging.debug( ' ***> [%-30s] ~FRYouTube fetch failed: %s.' % (self.feed.log_title[:30], address) @@ -313,162 +309,9 @@ class FetchFeed: json_fetcher = JSONFetcher(self.feed, self.options) return json_fetcher.fetch(address, headers) - def fetch_youtube(self, address): - username = None - channel_id = None - list_id = None - - if 'gdata.youtube.com' in address: - try: - username_groups = re.search(r'gdata.youtube.com/feeds/\w+/users/(\w+)/', address) - if not username_groups: - return - username = username_groups.group(1) - except IndexError: - return - elif 'youtube.com/@' in address: - try: - username = address.split('youtube.com/@')[1] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?user=' in address: - try: - username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0] - except IndexError: - return - elif 'youtube.com/feeds/videos.xml?channel_id=' in address: - try: - channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0] - except (IndexError, KeyError): - return - elif 'youtube.com/playlist' in address: - try: - list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0] - except IndexError: - return - elif 
'youtube.com/feeds/videos.xml?playlist_id' in address: - try: - list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0] - except IndexError: - return - - if channel_id: - video_ids_xml = requests.get( - "https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id - ) - channel_json = requests.get( - "https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" - % (channel_id, settings.YOUTUBE_API_KEY) - ) - channel = json.decode(channel_json.content) - try: - username = channel['items'][0]['snippet']['title'] - description = channel['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - elif list_id: - playlist_json = requests.get( - "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" - % (list_id, settings.YOUTUBE_API_KEY) - ) - playlist = json.decode(playlist_json.content) - try: - username = playlist['items'][0]['snippet']['title'] - description = playlist['items'][0]['snippet']['description'] - except (IndexError, KeyError): - return - channel_url = "https://www.youtube.com/playlist?list=%s" % list_id - elif username: - video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username) - description = "YouTube videos uploaded by %s" % username - else: - return - - if list_id: - playlist_json = requests.get( - "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" - % (list_id, settings.YOUTUBE_API_KEY) - ) - playlist = json.decode(playlist_json.content) - try: - video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']] - except (IndexError, KeyError): - return - else: - if video_ids_xml.status_code != 200: - return - video_ids_soup = BeautifulSoup(video_ids_xml.content, features="lxml") - channel_url = video_ids_soup.find('author').find('uri').getText() - video_ids = [] - for video_id in video_ids_soup.findAll('yt:videoid'): - video_ids.append(video_id.getText()) - - 
videos_json = requests.get( - "https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" - % (','.join(video_ids), settings.YOUTUBE_API_KEY) - ) - videos = json.decode(videos_json.content) - if 'error' in videos: - logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) - return - - data = {} - data['title'] = "%s's YouTube Videos" % username if 'Uploads' not in username else username - data['link'] = channel_url - data['description'] = description - data['lastBuildDate'] = datetime.datetime.utcnow() - data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL - data['docs'] = None - data['feed_url'] = address - rss = feedgenerator.Atom1Feed(**data) - - for video in videos['items']: - thumbnail = video['snippet']['thumbnails'].get('maxres') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('high') - if not thumbnail: - thumbnail = video['snippet']['thumbnails'].get('medium') - duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds - duration_min, seconds = divmod(duration_sec, 60) - hours, minutes = divmod(duration_min, 60) - if hours >= 1: - duration = "%s:%s:%s" % ( - hours, - '{0:02d}'.format(minutes), - '{0:02d}'.format(seconds), - ) - else: - duration = "%s:%s" % (minutes, '{0:02d}'.format(seconds)) - content = """
-                <div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s"></iframe></div>
-                <div class="NB-youtube-stats"><small>
-                    <b>From:</b> <a href="%s">%s</a><br />
-                    <b>Duration:</b> %s<br />
-                </small></div><hr>
-                <div class="NB-youtube-description">%s</div>
-                <img src="%s" style="display:none" />
- """ % ( - ("https://www.youtube.com/embed/" + video['id']), - channel_url, - username, - duration, - linkify(linebreaks(video['snippet']['description'])), - thumbnail['url'] if thumbnail else "", - ) - - link = "http://www.youtube.com/watch?v=%s" % video['id'] - story_data = { - 'title': video['snippet']['title'], - 'link': link, - 'description': content, - 'author_name': username, - 'categories': [], - 'unique_id': "tag:youtube.com,2008:video:%s" % video['id'], - 'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']), - } - rss.add_item(**story_data) - - return rss.writeString('utf-8') + def fetch_youtube(self): + youtube_fetcher = YoutubeFetcher(self.feed, self.options) + return youtube_fetcher.fetch() class ProcessFeed: diff --git a/utils/youtube_fetcher.py b/utils/youtube_fetcher.py new file mode 100644 index 000000000..6dac327ba --- /dev/null +++ b/utils/youtube_fetcher.py @@ -0,0 +1,218 @@ +import datetime +from utils import json_functions as json +import isodate +import re +import urllib.error +import urllib.parse +import urllib.request + +import dateutil.parser +import requests +from django.conf import settings +from django.utils import feedgenerator +from django.utils.html import linebreaks + +from apps.reader.models import UserSubscription +from apps.social.models import MSocialServices +from utils.story_functions import linkify +from utils import log as logging + + +class YoutubeFetcher: + def __init__(self, feed, options=None): + self.feed = feed + self.options = options or {} + self.address = self.feed.feed_address + + def fetch(self): + username = self.extract_username(self.address) + channel_id = self.extract_channel_id(self.address) + list_id = self.extract_list_id(self.address) + video_ids = None + + if channel_id: + video_ids, title, description = self.fetch_channel_videos(channel_id) + channel_url = "https://www.youtube.com/channel/%s" % channel_id + elif list_id: + video_ids, title, description = self.fetch_playlist_videos(list_id) 
+ channel_url = "https://www.youtube.com/playlist?list=%s" % list_id + elif username: + video_ids, title, description = self.fetch_user_videos(username) + channel_url = "https://www.youtube.com/user/%s" % username + + if not video_ids: + return + + videos = self.fetch_videos(video_ids) + data = {} + if username: + data["title"] = f"{username}'s YouTube Videos" + else: + data["title"] = title + data["link"] = channel_url + data["description"] = description + data["lastBuildDate"] = datetime.datetime.utcnow() + data["generator"] = "NewsBlur YouTube API v3 Decrapifier - %s" % settings.NEWSBLUR_URL + data["docs"] = None + data["feed_url"] = self.address + rss = feedgenerator.Atom1Feed(**data) + + for video in videos["items"]: + thumbnail = video["snippet"]["thumbnails"].get("maxres") + if not thumbnail: + thumbnail = video["snippet"]["thumbnails"].get("high") + if not thumbnail: + thumbnail = video["snippet"]["thumbnails"].get("medium") + duration_sec = isodate.parse_duration(video["contentDetails"]["duration"]).seconds + duration_min, seconds = divmod(duration_sec, 60) + hours, minutes = divmod(duration_min, 60) + if hours >= 1: + duration = "%s:%s:%s" % ( + hours, + "{0:02d}".format(minutes), + "{0:02d}".format(seconds), + ) + else: + duration = "%s:%s" % (minutes, "{0:02d}".format(seconds)) + content = """
+ +
+
+ From: %s
+ Duration: %s
+

+
%s
+ """ % ( + ("https://www.youtube.com/embed/" + video["id"]), + channel_url, + username, + duration, + linkify(linebreaks(video["snippet"]["description"])), + thumbnail["url"] if thumbnail else "", + ) + + link = "http://www.youtube.com/watch?v=%s" % video["id"] + story_data = { + "title": video["snippet"]["title"], + "link": link, + "description": content, + "author_name": username, + "categories": [], + "unique_id": "tag:youtube.com,2008:video:%s" % video["id"], + "pubdate": dateutil.parser.parse(video["snippet"]["publishedAt"]), + } + rss.add_item(**story_data) + + return rss.writeString("utf-8") + + def extract_username(self, url): + if "gdata.youtube.com" in url: + try: + username_groups = re.search(r"gdata.youtube.com/feeds/\w+/users/(\w+)/", url) + if not username_groups: + return + return username_groups.group(1) + except IndexError: + return + elif "youtube.com/@" in url: + try: + return url.split("youtube.com/@")[1] + except IndexError: + return + elif "youtube.com/feeds/videos.xml?user=" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["user"][0] + except IndexError: + return + elif "youtube.com/user/" in url: + username = re.findall(r"youtube.com/user/([^/]+)", url) + if username: + return username[0] + + def extract_channel_id(self, url): + if "youtube.com/feeds/videos.xml?channel_id=" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["channel_id"][0] + except (IndexError, KeyError): + return + + def extract_list_id(self, url): + if "youtube.com/playlist" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["list"][0] + except IndexError: + return + elif "youtube.com/feeds/videos.xml?playlist_id" in url: + try: + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)["playlist_id"][0] + except IndexError: + return + + def fetch_videos(self, video_ids): + videos_json = requests.get( + 
"https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" + % (",".join(video_ids), settings.YOUTUBE_API_KEY) + ) + videos = json.decode(videos_json.content) + if "error" in videos: + logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) + return + return videos + + def fetch_channel_videos(self, channel_id): + channel_json = requests.get( + "https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&id=%s&key=%s" + % (channel_id, settings.YOUTUBE_API_KEY) + ) + channel = json.decode(channel_json.content) + try: + title = channel["items"][0]["snippet"]["title"] + description = channel["items"][0]["snippet"]["description"] + uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + except (IndexError, KeyError): + return + + return self.fetch_playlist_videos(uploads_list_id, title, description) + + def fetch_playlist_videos(self, list_id, title=None, description=None): + if not title and not description: + playlist_json = requests.get( + "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" + % (list_id, settings.YOUTUBE_API_KEY) + ) + playlist = json.decode(playlist_json.content) + try: + title = playlist["items"][0]["snippet"]["title"] + description = playlist["items"][0]["snippet"]["description"] + except (IndexError, KeyError): + return None, None, None + + playlist_json = requests.get( + "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" + % (list_id, settings.YOUTUBE_API_KEY) + ) + playlist = json.decode(playlist_json.content) + try: + video_ids = [video["snippet"]["resourceId"]["videoId"] for video in playlist["items"]] + except (IndexError, KeyError): + return None, None, None + + return video_ids, title, description + + def fetch_user_videos(self, username, username_key="forUsername"): + channel_json = requests.get( + 
"https://www.googleapis.com/youtube/v3/channels?part=snippet,contentDetails&%s=%s&key=%s" + % (username_key, username, settings.YOUTUBE_API_KEY) + ) + channel = json.decode(channel_json.content) + try: + title = channel["items"][0]["snippet"]["title"] + description = channel["items"][0]["snippet"]["description"] + uploads_list_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"] + except (IndexError, KeyError): + return + + if not uploads_list_id and username_key == "forUsername": + return self.fetch_user_videos(username, username_key="forHandle") + + return self.fetch_playlist_videos(uploads_list_id, title, description)